1 /* $NetBSD: softfloat.c,v 1.8 2011/07/10 04:52:23 matt Exp $ */
2 
3 /*
4  * This version hacked for use with gcc -msoft-float by bjh21.
5  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6  *  itself).
7  */
8 
9 /*
10  * Things you may want to define:
11  *
12  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
14  *   properly renamed.
15  */
16 
17 /*
18 ===============================================================================
19 
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 #include <sys/cdefs.h>
48 __FBSDID("$FreeBSD$");
49 
50 #ifdef SOFTFLOAT_FOR_GCC
51 #include "softfloat-for-gcc.h"
52 #endif
53 
54 #include "milieu.h"
55 #include "softfloat.h"
56 
57 /*
58  * Conversions between floats as stored in memory and floats as
59  * SoftFloat uses them
60  */
61 #ifndef FLOAT64_DEMANGLE
62 #define FLOAT64_DEMANGLE(a)	(a)
63 #endif
64 #ifndef FLOAT64_MANGLE
65 #define FLOAT64_MANGLE(a)	(a)
66 #endif
67 
68 /*
69 -------------------------------------------------------------------------------
70 Floating-point rounding mode, extended double-precision rounding precision,
71 and exception flags.
72 -------------------------------------------------------------------------------
73 */
74 int float_rounding_mode = float_round_nearest_even;
75 int float_exception_flags = 0;
76 #ifdef FLOATX80
77 int8 floatx80_rounding_precision = 80;
78 #endif
79 
80 /*
81 -------------------------------------------------------------------------------
82 Primitive arithmetic functions, including multi-word arithmetic, and
83 division and square root approximations.  (Can be specialized to target if
84 desired.)
85 -------------------------------------------------------------------------------
86 */
87 #include "softfloat-macros"
88 
89 /*
90 -------------------------------------------------------------------------------
91 Functions and definitions to determine:  (1) whether tininess for underflow
92 is detected before or after rounding by default, (2) what (if anything)
93 happens when exceptions are raised, (3) how signaling NaNs are distinguished
94 from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
95 are propagated from function inputs to output.  These details are target-
96 specific.
97 -------------------------------------------------------------------------------
98 */
99 #include "softfloat-specialize"
100 
101 #if !defined(SOFTFLOAT_FOR_GCC) || defined(FLOATX80) || defined(FLOAT128)
102 /*
103 -------------------------------------------------------------------------------
104 Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
105 and 7, and returns the properly rounded 32-bit integer corresponding to the
106 input.  If `zSign' is 1, the input is negated before being converted to an
107 integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
108 is simply rounded to an integer, with the inexact exception raised if the
109 input cannot be represented exactly as an integer.  However, if the fixed-
110 point input is too large, the invalid exception is raised and the largest
111 positive or negative integer is returned.
112 -------------------------------------------------------------------------------
113 */
114 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
115 {
116     int8 roundingMode;
117     flag roundNearestEven;
118     int8 roundIncrement, roundBits;
119     int32 z;
120 
121     roundingMode = float_rounding_mode;
122     roundNearestEven = ( roundingMode == float_round_nearest_even );
123     roundIncrement = 0x40;
124     if ( ! roundNearestEven ) {
125         if ( roundingMode == float_round_to_zero ) {
126             roundIncrement = 0;
127         }
128         else {
129             roundIncrement = 0x7F;
130             if ( zSign ) {
131                 if ( roundingMode == float_round_up ) roundIncrement = 0;
132             }
133             else {
134                 if ( roundingMode == float_round_down ) roundIncrement = 0;
135             }
136         }
137     }
138     roundBits = absZ & 0x7F;
139     absZ = ( absZ + roundIncrement )>>7;
140     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
141     z = absZ;
142     if ( zSign ) z = - z;
143     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
144         float_raise( float_flag_invalid );
145         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
146     }
147     if ( roundBits ) float_exception_flags |= float_flag_inexact;
148     return z;
149 
150 }
151 
152 /*
153 -------------------------------------------------------------------------------
154 Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
155 `absZ1', with binary point between bits 63 and 64 (between the input words),
156 and returns the properly rounded 64-bit integer corresponding to the input.
157 If `zSign' is 1, the input is negated before being converted to an integer.
158 Ordinarily, the fixed-point input is simply rounded to an integer, with
159 the inexact exception raised if the input cannot be represented exactly as
160 an integer.  However, if the fixed-point input is too large, the invalid
161 exception is raised and the largest positive or negative integer is
162 returned.
163 -------------------------------------------------------------------------------
164 */
165 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
166 {
167     int8 roundingMode;
168     flag roundNearestEven, increment;
169     int64 z;
170 
171     roundingMode = float_rounding_mode;
172     roundNearestEven = ( roundingMode == float_round_nearest_even );
173     increment = ( (sbits64) absZ1 < 0 );
174     if ( ! roundNearestEven ) {
175         if ( roundingMode == float_round_to_zero ) {
176             increment = 0;
177         }
178         else {
179             if ( zSign ) {
180                 increment = ( roundingMode == float_round_down ) && absZ1;
181             }
182             else {
183                 increment = ( roundingMode == float_round_up ) && absZ1;
184             }
185         }
186     }
187     if ( increment ) {
188         ++absZ0;
189         if ( absZ0 == 0 ) goto overflow;
190         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
191     }
192     z = absZ0;
193     if ( zSign ) z = - z;
194     if ( z && ( ( z < 0 ) ^ zSign ) ) {
195  overflow:
196         float_raise( float_flag_invalid );
197         return
198               zSign ? (sbits64) LIT64( 0x8000000000000000 )
199             : LIT64( 0x7FFFFFFFFFFFFFFF );
200     }
201     if ( absZ1 ) float_exception_flags |= float_flag_inexact;
202     return z;
203 
204 }
205 #endif
206 
207 /*
208 -------------------------------------------------------------------------------
209 Returns the fraction bits of the single-precision floating-point value `a'.
210 -------------------------------------------------------------------------------
211 */
212 INLINE bits32 extractFloat32Frac( float32 a )
213 {
214 
215     return a & 0x007FFFFF;
216 
217 }
218 
219 /*
220 -------------------------------------------------------------------------------
221 Returns the exponent bits of the single-precision floating-point value `a'.
222 -------------------------------------------------------------------------------
223 */
224 INLINE int16 extractFloat32Exp( float32 a )
225 {
226 
227     return ( a>>23 ) & 0xFF;
228 
229 }
230 
231 /*
232 -------------------------------------------------------------------------------
233 Returns the sign bit of the single-precision floating-point value `a'.
234 -------------------------------------------------------------------------------
235 */
236 INLINE flag extractFloat32Sign( float32 a )
237 {
238 
239     return a>>31;
240 
241 }
242 
243 /*
244 -------------------------------------------------------------------------------
245 Normalizes the subnormal single-precision floating-point value represented
246 by the denormalized significand `aSig'.  The normalized exponent and
247 significand are stored at the locations pointed to by `zExpPtr' and
248 `zSigPtr', respectively.
249 -------------------------------------------------------------------------------
250 */
251 static void
252  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
253 {
254     int8 shiftCount;
255 
256     shiftCount = countLeadingZeros32( aSig ) - 8;
257     *zSigPtr = aSig<<shiftCount;
258     *zExpPtr = 1 - shiftCount;
259 
260 }
261 
262 /*
263 -------------------------------------------------------------------------------
264 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
265 single-precision floating-point value, returning the result.  After being
266 shifted into the proper positions, the three fields are simply added
267 together to form the result.  This means that any integer portion of `zSig'
268 will be added into the exponent.  Since a properly normalized significand
269 will have an integer portion equal to 1, the `zExp' input should be 1 less
270 than the desired result exponent whenever `zSig' is a complete, normalized
271 significand.
272 -------------------------------------------------------------------------------
273 */
274 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
275 {
276 
277     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
278 
279 }
280 
281 /*
282 -------------------------------------------------------------------------------
283 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
284 and significand `zSig', and returns the proper single-precision floating-
285 point value corresponding to the abstract input.  Ordinarily, the abstract
286 value is simply rounded and packed into the single-precision format, with
287 the inexact exception raised if the abstract input cannot be represented
288 exactly.  However, if the abstract value is too large, the overflow and
289 inexact exceptions are raised and an infinity or maximal finite value is
290 returned.  If the abstract value is too small, the input value is rounded to
291 a subnormal number, and the underflow and inexact exceptions are raised if
292 the abstract input cannot be represented exactly as a subnormal single-
293 precision floating-point number.
294     The input significand `zSig' has its binary point between bits 30
295 and 29, which is 7 bits to the left of the usual location.  This shifted
296 significand must be normalized or smaller.  If `zSig' is not normalized,
297 `zExp' must be 0; in that case, the result returned is a subnormal number,
298 and it must not require rounding.  In the usual case that `zSig' is
299 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
300 The handling of underflow and overflow follows the IEC/IEEE Standard for
301 Binary Floating-Point Arithmetic.
302 -------------------------------------------------------------------------------
303 */
304 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
305 {
306     int8 roundingMode;
307     flag roundNearestEven;
308     int8 roundIncrement, roundBits;
309     flag isTiny;
310 
311     roundingMode = float_rounding_mode;
312     roundNearestEven = ( roundingMode == float_round_nearest_even );
313     roundIncrement = 0x40;
314     if ( ! roundNearestEven ) {
315         if ( roundingMode == float_round_to_zero ) {
316             roundIncrement = 0;
317         }
318         else {
319             roundIncrement = 0x7F;
320             if ( zSign ) {
321                 if ( roundingMode == float_round_up ) roundIncrement = 0;
322             }
323             else {
324                 if ( roundingMode == float_round_down ) roundIncrement = 0;
325             }
326         }
327     }
328     roundBits = zSig & 0x7F;
329     if ( 0xFD <= (bits16) zExp ) {
330         if (    ( 0xFD < zExp )
331              || (    ( zExp == 0xFD )
332                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
333            ) {
334             float_raise( float_flag_overflow | float_flag_inexact );
335             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
336         }
337         if ( zExp < 0 ) {
338             isTiny =
339                    ( float_detect_tininess == float_tininess_before_rounding )
340                 || ( zExp < -1 )
341                 || ( zSig + roundIncrement < 0x80000000 );
342             shift32RightJamming( zSig, - zExp, &zSig );
343             zExp = 0;
344             roundBits = zSig & 0x7F;
345             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
346         }
347     }
348     if ( roundBits ) float_exception_flags |= float_flag_inexact;
349     zSig = ( zSig + roundIncrement )>>7;
350     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
351     if ( zSig == 0 ) zExp = 0;
352     return packFloat32( zSign, zExp, zSig );
353 
354 }
355 
356 /*
357 -------------------------------------------------------------------------------
358 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
359 and significand `zSig', and returns the proper single-precision floating-
360 point value corresponding to the abstract input.  This routine is just like
361 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
362 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
363 floating-point exponent.
364 -------------------------------------------------------------------------------
365 */
366 static float32
367  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
368 {
369     int8 shiftCount;
370 
371     shiftCount = countLeadingZeros32( zSig ) - 1;
372     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
373 
374 }
375 
376 /*
377 -------------------------------------------------------------------------------
378 Returns the fraction bits of the double-precision floating-point value `a'.
379 -------------------------------------------------------------------------------
380 */
381 INLINE bits64 extractFloat64Frac( float64 a )
382 {
383 
384     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
385 
386 }
387 
388 /*
389 -------------------------------------------------------------------------------
390 Returns the exponent bits of the double-precision floating-point value `a'.
391 -------------------------------------------------------------------------------
392 */
393 INLINE int16 extractFloat64Exp( float64 a )
394 {
395 
396     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
397 
398 }
399 
400 /*
401 -------------------------------------------------------------------------------
402 Returns the sign bit of the double-precision floating-point value `a'.
403 -------------------------------------------------------------------------------
404 */
405 INLINE flag extractFloat64Sign( float64 a )
406 {
407 
408     return FLOAT64_DEMANGLE(a)>>63;
409 
410 }
411 
412 /*
413 -------------------------------------------------------------------------------
414 Normalizes the subnormal double-precision floating-point value represented
415 by the denormalized significand `aSig'.  The normalized exponent and
416 significand are stored at the locations pointed to by `zExpPtr' and
417 `zSigPtr', respectively.
418 -------------------------------------------------------------------------------
419 */
420 static void
421  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
422 {
423     int8 shiftCount;
424 
425     shiftCount = countLeadingZeros64( aSig ) - 11;
426     *zSigPtr = aSig<<shiftCount;
427     *zExpPtr = 1 - shiftCount;
428 
429 }
430 
431 /*
432 -------------------------------------------------------------------------------
433 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
434 double-precision floating-point value, returning the result.  After being
435 shifted into the proper positions, the three fields are simply added
436 together to form the result.  This means that any integer portion of `zSig'
437 will be added into the exponent.  Since a properly normalized significand
438 will have an integer portion equal to 1, the `zExp' input should be 1 less
439 than the desired result exponent whenever `zSig' is a complete, normalized
440 significand.
441 -------------------------------------------------------------------------------
442 */
443 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
444 {
445 
446     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
447 			   ( ( (bits64) zExp )<<52 ) + zSig );
448 
449 }
450 
451 /*
452 -------------------------------------------------------------------------------
453 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
454 and significand `zSig', and returns the proper double-precision floating-
455 point value corresponding to the abstract input.  Ordinarily, the abstract
456 value is simply rounded and packed into the double-precision format, with
457 the inexact exception raised if the abstract input cannot be represented
458 exactly.  However, if the abstract value is too large, the overflow and
459 inexact exceptions are raised and an infinity or maximal finite value is
460 returned.  If the abstract value is too small, the input value is rounded to
461 a subnormal number, and the underflow and inexact exceptions are raised if
462 the abstract input cannot be represented exactly as a subnormal double-
463 precision floating-point number.
464     The input significand `zSig' has its binary point between bits 62
465 and 61, which is 10 bits to the left of the usual location.  This shifted
466 significand must be normalized or smaller.  If `zSig' is not normalized,
467 `zExp' must be 0; in that case, the result returned is a subnormal number,
468 and it must not require rounding.  In the usual case that `zSig' is
469 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
470 The handling of underflow and overflow follows the IEC/IEEE Standard for
471 Binary Floating-Point Arithmetic.
472 -------------------------------------------------------------------------------
473 */
474 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
475 {
476     int8 roundingMode;
477     flag roundNearestEven;
478     int16 roundIncrement, roundBits;
479     flag isTiny;
480 
481     roundingMode = float_rounding_mode;
482     roundNearestEven = ( roundingMode == float_round_nearest_even );
483     roundIncrement = 0x200;
484     if ( ! roundNearestEven ) {
485         if ( roundingMode == float_round_to_zero ) {
486             roundIncrement = 0;
487         }
488         else {
489             roundIncrement = 0x3FF;
490             if ( zSign ) {
491                 if ( roundingMode == float_round_up ) roundIncrement = 0;
492             }
493             else {
494                 if ( roundingMode == float_round_down ) roundIncrement = 0;
495             }
496         }
497     }
498     roundBits = zSig & 0x3FF;
499     if ( 0x7FD <= (bits16) zExp ) {
500         if (    ( 0x7FD < zExp )
501              || (    ( zExp == 0x7FD )
502                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
503            ) {
504             float_raise( float_flag_overflow | float_flag_inexact );
505             return FLOAT64_MANGLE(
506 		FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
507 		( roundIncrement == 0 ));
508         }
509         if ( zExp < 0 ) {
510             isTiny =
511                    ( float_detect_tininess == float_tininess_before_rounding )
512                 || ( zExp < -1 )
513                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
514             shift64RightJamming( zSig, - zExp, &zSig );
515             zExp = 0;
516             roundBits = zSig & 0x3FF;
517             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
518         }
519     }
520     if ( roundBits ) float_exception_flags |= float_flag_inexact;
521     zSig = ( zSig + roundIncrement )>>10;
522     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
523     if ( zSig == 0 ) zExp = 0;
524     return packFloat64( zSign, zExp, zSig );
525 
526 }
527 
528 /*
529 -------------------------------------------------------------------------------
530 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
531 and significand `zSig', and returns the proper double-precision floating-
532 point value corresponding to the abstract input.  This routine is just like
533 `roundAndPackFloat64' except that `zSig' does not have to be normalized.
534 Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
535 floating-point exponent.
536 -------------------------------------------------------------------------------
537 */
538 static float64
539  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
540 {
541     int8 shiftCount;
542 
543     shiftCount = countLeadingZeros64( zSig ) - 1;
544     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
545 
546 }
547 
548 #ifdef FLOATX80
549 
550 /*
551 -------------------------------------------------------------------------------
552 Returns the fraction bits of the extended double-precision floating-point
553 value `a'.
554 -------------------------------------------------------------------------------
555 */
556 INLINE bits64 extractFloatx80Frac( floatx80 a )
557 {
558 
559     return a.low;
560 
561 }
562 
563 /*
564 -------------------------------------------------------------------------------
565 Returns the exponent bits of the extended double-precision floating-point
566 value `a'.
567 -------------------------------------------------------------------------------
568 */
569 INLINE int32 extractFloatx80Exp( floatx80 a )
570 {
571 
572     return a.high & 0x7FFF;
573 
574 }
575 
576 /*
577 -------------------------------------------------------------------------------
578 Returns the sign bit of the extended double-precision floating-point value
579 `a'.
580 -------------------------------------------------------------------------------
581 */
582 INLINE flag extractFloatx80Sign( floatx80 a )
583 {
584 
585     return a.high>>15;
586 
587 }
588 
589 /*
590 -------------------------------------------------------------------------------
591 Normalizes the subnormal extended double-precision floating-point value
592 represented by the denormalized significand `aSig'.  The normalized exponent
593 and significand are stored at the locations pointed to by `zExpPtr' and
594 `zSigPtr', respectively.
595 -------------------------------------------------------------------------------
596 */
597 static void
598  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
599 {
600     int8 shiftCount;
601 
602     shiftCount = countLeadingZeros64( aSig );
603     *zSigPtr = aSig<<shiftCount;
604     *zExpPtr = 1 - shiftCount;
605 
606 }
607 
608 /*
609 -------------------------------------------------------------------------------
610 Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
611 extended double-precision floating-point value, returning the result.
612 -------------------------------------------------------------------------------
613 */
614 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
615 {
616     floatx80 z;
617 
618     z.low = zSig;
619     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
620     return z;
621 
622 }
623 
624 /*
625 -------------------------------------------------------------------------------
626 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
627 and extended significand formed by the concatenation of `zSig0' and `zSig1',
628 and returns the proper extended double-precision floating-point value
629 corresponding to the abstract input.  Ordinarily, the abstract value is
630 rounded and packed into the extended double-precision format, with the
631 inexact exception raised if the abstract input cannot be represented
632 exactly.  However, if the abstract value is too large, the overflow and
633 inexact exceptions are raised and an infinity or maximal finite value is
634 returned.  If the abstract value is too small, the input value is rounded to
635 a subnormal number, and the underflow and inexact exceptions are raised if
636 the abstract input cannot be represented exactly as a subnormal extended
637 double-precision floating-point number.
638     If `roundingPrecision' is 32 or 64, the result is rounded to the same
639 number of bits as single or double precision, respectively.  Otherwise, the
640 result is rounded to the full precision of the extended double-precision
641 format.
642     The input significand must be normalized or smaller.  If the input
643 significand is not normalized, `zExp' must be 0; in that case, the result
644 returned is a subnormal number, and it must not require rounding.  The
645 handling of underflow and overflow follows the IEC/IEEE Standard for Binary
646 Floating-Point Arithmetic.
647 -------------------------------------------------------------------------------
648 */
649 static floatx80
650  roundAndPackFloatx80(
651      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
652  )
653 {
654     int8 roundingMode;
655     flag roundNearestEven, increment, isTiny;
656     int64 roundIncrement, roundMask, roundBits;
657 
658     roundingMode = float_rounding_mode;
659     roundNearestEven = ( roundingMode == float_round_nearest_even );
660     if ( roundingPrecision == 80 ) goto precision80;
661     if ( roundingPrecision == 64 ) {
662         roundIncrement = LIT64( 0x0000000000000400 );
663         roundMask = LIT64( 0x00000000000007FF );
664     }
665     else if ( roundingPrecision == 32 ) {
666         roundIncrement = LIT64( 0x0000008000000000 );
667         roundMask = LIT64( 0x000000FFFFFFFFFF );
668     }
669     else {
670         goto precision80;
671     }
672     zSig0 |= ( zSig1 != 0 );
673     if ( ! roundNearestEven ) {
674         if ( roundingMode == float_round_to_zero ) {
675             roundIncrement = 0;
676         }
677         else {
678             roundIncrement = roundMask;
679             if ( zSign ) {
680                 if ( roundingMode == float_round_up ) roundIncrement = 0;
681             }
682             else {
683                 if ( roundingMode == float_round_down ) roundIncrement = 0;
684             }
685         }
686     }
687     roundBits = zSig0 & roundMask;
688     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
689         if (    ( 0x7FFE < zExp )
690              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
691            ) {
692             goto overflow;
693         }
694         if ( zExp <= 0 ) {
695             isTiny =
696                    ( float_detect_tininess == float_tininess_before_rounding )
697                 || ( zExp < 0 )
698                 || ( zSig0 <= zSig0 + roundIncrement );
699             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
700             zExp = 0;
701             roundBits = zSig0 & roundMask;
702             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
703             if ( roundBits ) float_exception_flags |= float_flag_inexact;
704             zSig0 += roundIncrement;
705             if ( (sbits64) zSig0 < 0 ) zExp = 1;
706             roundIncrement = roundMask + 1;
707             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
708                 roundMask |= roundIncrement;
709             }
710             zSig0 &= ~ roundMask;
711             return packFloatx80( zSign, zExp, zSig0 );
712         }
713     }
714     if ( roundBits ) float_exception_flags |= float_flag_inexact;
715     zSig0 += roundIncrement;
716     if ( zSig0 < roundIncrement ) {
717         ++zExp;
718         zSig0 = LIT64( 0x8000000000000000 );
719     }
720     roundIncrement = roundMask + 1;
721     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
722         roundMask |= roundIncrement;
723     }
724     zSig0 &= ~ roundMask;
725     if ( zSig0 == 0 ) zExp = 0;
726     return packFloatx80( zSign, zExp, zSig0 );
727  precision80:
728     increment = ( (sbits64) zSig1 < 0 );
729     if ( ! roundNearestEven ) {
730         if ( roundingMode == float_round_to_zero ) {
731             increment = 0;
732         }
733         else {
734             if ( zSign ) {
735                 increment = ( roundingMode == float_round_down ) && zSig1;
736             }
737             else {
738                 increment = ( roundingMode == float_round_up ) && zSig1;
739             }
740         }
741     }
742     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
743         if (    ( 0x7FFE < zExp )
744              || (    ( zExp == 0x7FFE )
745                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
746                   && increment
747                 )
748            ) {
749             roundMask = 0;
750  overflow:
751             float_raise( float_flag_overflow | float_flag_inexact );
752             if (    ( roundingMode == float_round_to_zero )
753                  || ( zSign && ( roundingMode == float_round_up ) )
754                  || ( ! zSign && ( roundingMode == float_round_down ) )
755                ) {
756                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
757             }
758             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
759         }
760         if ( zExp <= 0 ) {
761             isTiny =
762                    ( float_detect_tininess == float_tininess_before_rounding )
763                 || ( zExp < 0 )
764                 || ! increment
765                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
766             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
767             zExp = 0;
768             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
769             if ( zSig1 ) float_exception_flags |= float_flag_inexact;
770             if ( roundNearestEven ) {
771                 increment = ( (sbits64) zSig1 < 0 );
772             }
773             else {
774                 if ( zSign ) {
775                     increment = ( roundingMode == float_round_down ) && zSig1;
776                 }
777                 else {
778                     increment = ( roundingMode == float_round_up ) && zSig1;
779                 }
780             }
781             if ( increment ) {
782                 ++zSig0;
783                 zSig0 &=
784                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
785                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
786             }
787             return packFloatx80( zSign, zExp, zSig0 );
788         }
789     }
790     if ( zSig1 ) float_exception_flags |= float_flag_inexact;
791     if ( increment ) {
792         ++zSig0;
793         if ( zSig0 == 0 ) {
794             ++zExp;
795             zSig0 = LIT64( 0x8000000000000000 );
796         }
797         else {
798             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
799         }
800     }
801     else {
802         if ( zSig0 == 0 ) zExp = 0;
803     }
804     return packFloatx80( zSign, zExp, zSig0 );
805 
806 }
807 
808 /*
809 -------------------------------------------------------------------------------
810 Takes an abstract floating-point value having sign `zSign', exponent
811 `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
812 and returns the proper extended double-precision floating-point value
813 corresponding to the abstract input.  This routine is just like
814 `roundAndPackFloatx80' except that the input significand does not have to be
815 normalized.
816 -------------------------------------------------------------------------------
817 */
818 static floatx80
819  normalizeRoundAndPackFloatx80(
820      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
821  )
822 {
823     int8 shiftCount;
824 
825     if ( zSig0 == 0 ) {
826         zSig0 = zSig1;
827         zSig1 = 0;
828         zExp -= 64;
829     }
830     shiftCount = countLeadingZeros64( zSig0 );
831     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
832     zExp -= shiftCount;
833     return
834         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
835 
836 }
837 
838 #endif
839 
840 #ifdef FLOAT128
841 
842 /*
843 -------------------------------------------------------------------------------
844 Returns the least-significant 64 fraction bits of the quadruple-precision
845 floating-point value `a'.
846 -------------------------------------------------------------------------------
847 */
848 INLINE bits64 extractFloat128Frac1( float128 a )
849 {
850 
851     return a.low;
852 
853 }
854 
855 /*
856 -------------------------------------------------------------------------------
857 Returns the most-significant 48 fraction bits of the quadruple-precision
858 floating-point value `a'.
859 -------------------------------------------------------------------------------
860 */
861 INLINE bits64 extractFloat128Frac0( float128 a )
862 {
863 
864     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
865 
866 }
867 
868 /*
869 -------------------------------------------------------------------------------
870 Returns the exponent bits of the quadruple-precision floating-point value
871 `a'.
872 -------------------------------------------------------------------------------
873 */
874 INLINE int32 extractFloat128Exp( float128 a )
875 {
876 
877     return ( a.high>>48 ) & 0x7FFF;
878 
879 }
880 
881 /*
882 -------------------------------------------------------------------------------
883 Returns the sign bit of the quadruple-precision floating-point value `a'.
884 -------------------------------------------------------------------------------
885 */
886 INLINE flag extractFloat128Sign( float128 a )
887 {
888 
889     return a.high>>63;
890 
891 }
892 
893 /*
894 -------------------------------------------------------------------------------
895 Normalizes the subnormal quadruple-precision floating-point value
896 represented by the denormalized significand formed by the concatenation of
897 `aSig0' and `aSig1'.  The normalized exponent is stored at the location
898 pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
899 significand are stored at the location pointed to by `zSig0Ptr', and the
900 least significant 64 bits of the normalized significand are stored at the
901 location pointed to by `zSig1Ptr'.
902 -------------------------------------------------------------------------------
903 */
904 static void
905  normalizeFloat128Subnormal(
906      bits64 aSig0,
907      bits64 aSig1,
908      int32 *zExpPtr,
909      bits64 *zSig0Ptr,
910      bits64 *zSig1Ptr
911  )
912 {
913     int8 shiftCount;
914 
915     if ( aSig0 == 0 ) {
916         shiftCount = countLeadingZeros64( aSig1 ) - 15;
917         if ( shiftCount < 0 ) {
918             *zSig0Ptr = aSig1>>( - shiftCount );
919             *zSig1Ptr = aSig1<<( shiftCount & 63 );
920         }
921         else {
922             *zSig0Ptr = aSig1<<shiftCount;
923             *zSig1Ptr = 0;
924         }
925         *zExpPtr = - shiftCount - 63;
926     }
927     else {
928         shiftCount = countLeadingZeros64( aSig0 ) - 15;
929         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
930         *zExpPtr = 1 - shiftCount;
931     }
932 
933 }
934 
935 /*
936 -------------------------------------------------------------------------------
937 Packs the sign `zSign', the exponent `zExp', and the significand formed
938 by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
939 floating-point value, returning the result.  After being shifted into the
940 proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
941 added together to form the most significant 32 bits of the result.  This
942 means that any integer portion of `zSig0' will be added into the exponent.
943 Since a properly normalized significand will have an integer portion equal
944 to 1, the `zExp' input should be 1 less than the desired result exponent
945 whenever `zSig0' and `zSig1' concatenated form a complete, normalized
946 significand.
947 -------------------------------------------------------------------------------
948 */
949 INLINE float128
950  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
951 {
952     float128 z;
953 
954     z.low = zSig1;
955     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
956     return z;
957 
958 }
959 
960 /*
961 -------------------------------------------------------------------------------
962 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
963 and extended significand formed by the concatenation of `zSig0', `zSig1',
964 and `zSig2', and returns the proper quadruple-precision floating-point value
965 corresponding to the abstract input.  Ordinarily, the abstract value is
966 simply rounded and packed into the quadruple-precision format, with the
967 inexact exception raised if the abstract input cannot be represented
968 exactly.  However, if the abstract value is too large, the overflow and
969 inexact exceptions are raised and an infinity or maximal finite value is
970 returned.  If the abstract value is too small, the input value is rounded to
971 a subnormal number, and the underflow and inexact exceptions are raised if
972 the abstract input cannot be represented exactly as a subnormal quadruple-
973 precision floating-point number.
974     The input significand must be normalized or smaller.  If the input
975 significand is not normalized, `zExp' must be 0; in that case, the result
976 returned is a subnormal number, and it must not require rounding.  In the
977 usual case that the input significand is normalized, `zExp' must be 1 less
978 than the ``true'' floating-point exponent.  The handling of underflow and
979 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
980 -------------------------------------------------------------------------------
981 */
982 static float128
983  roundAndPackFloat128(
984      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
985 {
986     int8 roundingMode;
987     flag roundNearestEven, increment, isTiny;
988 
989     roundingMode = float_rounding_mode;
990     roundNearestEven = ( roundingMode == float_round_nearest_even );
991     increment = ( (sbits64) zSig2 < 0 );
992     if ( ! roundNearestEven ) {
993         if ( roundingMode == float_round_to_zero ) {
994             increment = 0;
995         }
996         else {
997             if ( zSign ) {
998                 increment = ( roundingMode == float_round_down ) && zSig2;
999             }
1000             else {
1001                 increment = ( roundingMode == float_round_up ) && zSig2;
1002             }
1003         }
1004     }
1005     if ( 0x7FFD <= (bits32) zExp ) {
1006         if (    ( 0x7FFD < zExp )
1007              || (    ( zExp == 0x7FFD )
1008                   && eq128(
1009                          LIT64( 0x0001FFFFFFFFFFFF ),
1010                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1011                          zSig0,
1012                          zSig1
1013                      )
1014                   && increment
1015                 )
1016            ) {
1017             float_raise( float_flag_overflow | float_flag_inexact );
1018             if (    ( roundingMode == float_round_to_zero )
1019                  || ( zSign && ( roundingMode == float_round_up ) )
1020                  || ( ! zSign && ( roundingMode == float_round_down ) )
1021                ) {
1022                 return
1023                     packFloat128(
1024                         zSign,
1025                         0x7FFE,
1026                         LIT64( 0x0000FFFFFFFFFFFF ),
1027                         LIT64( 0xFFFFFFFFFFFFFFFF )
1028                     );
1029             }
1030             return packFloat128( zSign, 0x7FFF, 0, 0 );
1031         }
1032         if ( zExp < 0 ) {
1033             isTiny =
1034                    ( float_detect_tininess == float_tininess_before_rounding )
1035                 || ( zExp < -1 )
1036                 || ! increment
1037                 || lt128(
1038                        zSig0,
1039                        zSig1,
1040                        LIT64( 0x0001FFFFFFFFFFFF ),
1041                        LIT64( 0xFFFFFFFFFFFFFFFF )
1042                    );
1043             shift128ExtraRightJamming(
1044                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1045             zExp = 0;
1046             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1047             if ( roundNearestEven ) {
1048                 increment = ( (sbits64) zSig2 < 0 );
1049             }
1050             else {
1051                 if ( zSign ) {
1052                     increment = ( roundingMode == float_round_down ) && zSig2;
1053                 }
1054                 else {
1055                     increment = ( roundingMode == float_round_up ) && zSig2;
1056                 }
1057             }
1058         }
1059     }
1060     if ( zSig2 ) float_exception_flags |= float_flag_inexact;
1061     if ( increment ) {
1062         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1063         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1064     }
1065     else {
1066         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1067     }
1068     return packFloat128( zSign, zExp, zSig0, zSig1 );
1069 
1070 }
1071 
1072 /*
1073 -------------------------------------------------------------------------------
1074 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075 and significand formed by the concatenation of `zSig0' and `zSig1', and
1076 returns the proper quadruple-precision floating-point value corresponding
1077 to the abstract input.  This routine is just like `roundAndPackFloat128'
1078 except that the input significand has fewer bits and does not have to be
1079 normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1080 point exponent.
1081 -------------------------------------------------------------------------------
1082 */
1083 static float128
1084  normalizeRoundAndPackFloat128(
1085      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1086 {
1087     int8 shiftCount;
1088     bits64 zSig2;
1089 
1090     if ( zSig0 == 0 ) {
1091         zSig0 = zSig1;
1092         zSig1 = 0;
1093         zExp -= 64;
1094     }
1095     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096     if ( 0 <= shiftCount ) {
1097         zSig2 = 0;
1098         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1099     }
1100     else {
1101         shift128ExtraRightJamming(
1102             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1103     }
1104     zExp -= shiftCount;
1105     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1106 
1107 }
1108 
1109 #endif
1110 
1111 /*
1112 -------------------------------------------------------------------------------
1113 Returns the result of converting the 32-bit two's complement integer `a'
1114 to the single-precision floating-point format.  The conversion is performed
1115 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1116 -------------------------------------------------------------------------------
1117 */
1118 float32 int32_to_float32( int32 a )
1119 {
1120     flag zSign;
1121 
1122     if ( a == 0 ) return 0;
1123     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1124     zSign = ( a < 0 );
1125     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1126 
1127 }
1128 
1129 float32 uint32_to_float32( uint32 a )
1130 {
1131     if ( a == 0 ) return 0;
1132     if ( a & (bits32) 0x80000000 )
1133 	return normalizeRoundAndPackFloat32( 0, 0x9D, a >> 1 );
1134     return normalizeRoundAndPackFloat32( 0, 0x9C, a );
1135 }
1136 
1137 
1138 /*
1139 -------------------------------------------------------------------------------
1140 Returns the result of converting the 32-bit two's complement integer `a'
1141 to the double-precision floating-point format.  The conversion is performed
1142 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1143 -------------------------------------------------------------------------------
1144 */
1145 float64 int32_to_float64( int32 a )
1146 {
1147     flag zSign;
1148     uint32 absA;
1149     int8 shiftCount;
1150     bits64 zSig;
1151 
1152     if ( a == 0 ) return 0;
1153     zSign = ( a < 0 );
1154     absA = zSign ? - a : a;
1155     shiftCount = countLeadingZeros32( absA ) + 21;
1156     zSig = absA;
1157     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1158 
1159 }
1160 
1161 float64 uint32_to_float64( uint32 a )
1162 {
1163     int8 shiftCount;
1164     bits64 zSig = a;
1165 
1166     if ( a == 0 ) return 0;
1167     shiftCount = countLeadingZeros32( a ) + 21;
1168     return packFloat64( 0, 0x432 - shiftCount, zSig<<shiftCount );
1169 
1170 }
1171 
1172 #ifdef FLOATX80
1173 
1174 /*
1175 -------------------------------------------------------------------------------
1176 Returns the result of converting the 32-bit two's complement integer `a'
1177 to the extended double-precision floating-point format.  The conversion
1178 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1179 Arithmetic.
1180 -------------------------------------------------------------------------------
1181 */
1182 floatx80 int32_to_floatx80( int32 a )
1183 {
1184     flag zSign;
1185     uint32 absA;
1186     int8 shiftCount;
1187     bits64 zSig;
1188 
1189     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1190     zSign = ( a < 0 );
1191     absA = zSign ? - a : a;
1192     shiftCount = countLeadingZeros32( absA ) + 32;
1193     zSig = absA;
1194     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1195 
1196 }
1197 
1198 floatx80 uint32_to_floatx80( uint32 a )
1199 {
1200     int8 shiftCount;
1201     bits64 zSig = a;
1202 
1203     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1204     shiftCount = countLeadingZeros32( a ) + 32;
1205     return packFloatx80( 0, 0x403E - shiftCount, zSig<<shiftCount );
1206 
1207 }
1208 
1209 #endif
1210 
1211 #ifdef FLOAT128
1212 
1213 /*
1214 -------------------------------------------------------------------------------
1215 Returns the result of converting the 32-bit two's complement integer `a' to
1216 the quadruple-precision floating-point format.  The conversion is performed
1217 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1218 -------------------------------------------------------------------------------
1219 */
1220 float128 int32_to_float128( int32 a )
1221 {
1222     flag zSign;
1223     uint32 absA;
1224     int8 shiftCount;
1225     bits64 zSig0;
1226 
1227     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1228     zSign = ( a < 0 );
1229     absA = zSign ? - a : a;
1230     shiftCount = countLeadingZeros32( absA ) + 17;
1231     zSig0 = absA;
1232     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1233 
1234 }
1235 
1236 float128 uint32_to_float128( uint32 a )
1237 {
1238     int8 shiftCount;
1239     bits64 zSig0 = a;
1240 
1241     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1242     shiftCount = countLeadingZeros32( a ) + 17;
1243     return packFloat128( 0, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1244 
1245 }
1246 
1247 #endif
1248 
1249 #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1250 /*
1251 -------------------------------------------------------------------------------
1252 Returns the result of converting the 64-bit two's complement integer `a'
1253 to the single-precision floating-point format.  The conversion is performed
1254 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255 -------------------------------------------------------------------------------
1256 */
1257 float32 int64_to_float32( int64 a )
1258 {
1259     flag zSign;
1260     uint64 absA;
1261     int8 shiftCount;
1262 
1263     if ( a == 0 ) return 0;
1264     zSign = ( a < 0 );
1265     absA = zSign ? - a : a;
1266     shiftCount = countLeadingZeros64( absA ) - 40;
1267     if ( 0 <= shiftCount ) {
1268         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1269     }
1270     else {
1271         shiftCount += 7;
1272         if ( shiftCount < 0 ) {
1273             shift64RightJamming( absA, - shiftCount, &absA );
1274         }
1275         else {
1276             absA <<= shiftCount;
1277         }
1278         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1279     }
1280 
1281 }
1282 
1283 /*
1284 -------------------------------------------------------------------------------
1285 Returns the result of converting the 64-bit two's complement integer `a'
1286 to the double-precision floating-point format.  The conversion is performed
1287 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1288 -------------------------------------------------------------------------------
1289 */
1290 float64 int64_to_float64( int64 a )
1291 {
1292     flag zSign;
1293 
1294     if ( a == 0 ) return 0;
1295     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1296         return packFloat64( 1, 0x43E, 0 );
1297     }
1298     zSign = ( a < 0 );
1299     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1300 
1301 }
1302 
1303 #ifdef FLOATX80
1304 
1305 /*
1306 -------------------------------------------------------------------------------
1307 Returns the result of converting the 64-bit two's complement integer `a'
1308 to the extended double-precision floating-point format.  The conversion
1309 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1310 Arithmetic.
1311 -------------------------------------------------------------------------------
1312 */
1313 floatx80 int64_to_floatx80( int64 a )
1314 {
1315     flag zSign;
1316     uint64 absA;
1317     int8 shiftCount;
1318 
1319     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1320     zSign = ( a < 0 );
1321     absA = zSign ? - a : a;
1322     shiftCount = countLeadingZeros64( absA );
1323     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1324 
1325 }
1326 
1327 #endif
1328 
1329 #endif /* !SOFTFLOAT_FOR_GCC */
1330 
1331 #ifdef FLOAT128
1332 
1333 /*
1334 -------------------------------------------------------------------------------
1335 Returns the result of converting the 64-bit two's complement integer `a' to
1336 the quadruple-precision floating-point format.  The conversion is performed
1337 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1338 -------------------------------------------------------------------------------
1339 */
1340 float128 int64_to_float128( int64 a )
1341 {
1342     flag zSign;
1343     uint64 absA;
1344     int8 shiftCount;
1345     int32 zExp;
1346     bits64 zSig0, zSig1;
1347 
1348     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1349     zSign = ( a < 0 );
1350     absA = zSign ? - a : a;
1351     shiftCount = countLeadingZeros64( absA ) + 49;
1352     zExp = 0x406E - shiftCount;
1353     if ( 64 <= shiftCount ) {
1354         zSig1 = 0;
1355         zSig0 = absA;
1356         shiftCount -= 64;
1357     }
1358     else {
1359         zSig1 = absA;
1360         zSig0 = 0;
1361     }
1362     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1363     return packFloat128( zSign, zExp, zSig0, zSig1 );
1364 
1365 }
1366 
1367 #endif
1368 
1369 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1370 /*
1371 -------------------------------------------------------------------------------
1372 Returns the result of converting the single-precision floating-point value
1373 `a' to the 32-bit two's complement integer format.  The conversion is
1374 performed according to the IEC/IEEE Standard for Binary Floating-Point
1375 Arithmetic---which means in particular that the conversion is rounded
1376 according to the current rounding mode.  If `a' is a NaN, the largest
1377 positive integer is returned.  Otherwise, if the conversion overflows, the
1378 largest integer with the same sign as `a' is returned.
1379 -------------------------------------------------------------------------------
1380 */
1381 int32 float32_to_int32( float32 a )
1382 {
1383     flag aSign;
1384     int16 aExp, shiftCount;
1385     bits32 aSig;
1386     bits64 aSig64;
1387 
1388     aSig = extractFloat32Frac( a );
1389     aExp = extractFloat32Exp( a );
1390     aSign = extractFloat32Sign( a );
1391     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1392     if ( aExp ) aSig |= 0x00800000;
1393     shiftCount = 0xAF - aExp;
1394     aSig64 = aSig;
1395     aSig64 <<= 32;
1396     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1397     return roundAndPackInt32( aSign, aSig64 );
1398 
1399 }
1400 #endif /* !SOFTFLOAT_FOR_GCC */
1401 
1402 /*
1403 -------------------------------------------------------------------------------
1404 Returns the result of converting the single-precision floating-point value
1405 `a' to the 32-bit two's complement integer format.  The conversion is
1406 performed according to the IEC/IEEE Standard for Binary Floating-Point
1407 Arithmetic, except that the conversion is always rounded toward zero.
1408 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1409 the conversion overflows, the largest integer with the same sign as `a' is
1410 returned.
1411 -------------------------------------------------------------------------------
1412 */
1413 int32 float32_to_int32_round_to_zero( float32 a )
1414 {
1415     flag aSign;
1416     int16 aExp, shiftCount;
1417     bits32 aSig;
1418     int32 z;
1419 
1420     aSig = extractFloat32Frac( a );
1421     aExp = extractFloat32Exp( a );
1422     aSign = extractFloat32Sign( a );
1423     shiftCount = aExp - 0x9E;
1424     if ( 0 <= shiftCount ) {
1425         if ( a != 0xCF000000 ) {
1426             float_raise( float_flag_invalid );
1427             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1428         }
1429         return (sbits32) 0x80000000;
1430     }
1431     else if ( aExp <= 0x7E ) {
1432         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1433         return 0;
1434     }
1435     aSig = ( aSig | 0x00800000 )<<8;
1436     z = aSig>>( - shiftCount );
1437     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1438         float_exception_flags |= float_flag_inexact;
1439     }
1440     if ( aSign ) z = - z;
1441     return z;
1442 
1443 }
1444 
1445 #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1446 /*
1447 -------------------------------------------------------------------------------
1448 Returns the result of converting the single-precision floating-point value
1449 `a' to the 64-bit two's complement integer format.  The conversion is
1450 performed according to the IEC/IEEE Standard for Binary Floating-Point
1451 Arithmetic---which means in particular that the conversion is rounded
1452 according to the current rounding mode.  If `a' is a NaN, the largest
1453 positive integer is returned.  Otherwise, if the conversion overflows, the
1454 largest integer with the same sign as `a' is returned.
1455 -------------------------------------------------------------------------------
1456 */
1457 int64 float32_to_int64( float32 a )
1458 {
1459     flag aSign;
1460     int16 aExp, shiftCount;
1461     bits32 aSig;
1462     bits64 aSig64, aSigExtra;
1463 
1464     aSig = extractFloat32Frac( a );
1465     aExp = extractFloat32Exp( a );
1466     aSign = extractFloat32Sign( a );
1467     shiftCount = 0xBE - aExp;
1468     if ( shiftCount < 0 ) {
1469         float_raise( float_flag_invalid );
1470         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1471             return LIT64( 0x7FFFFFFFFFFFFFFF );
1472         }
1473         return (sbits64) LIT64( 0x8000000000000000 );
1474     }
1475     if ( aExp ) aSig |= 0x00800000;
1476     aSig64 = aSig;
1477     aSig64 <<= 40;
1478     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1479     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1480 
1481 }
1482 
1483 /*
1484 -------------------------------------------------------------------------------
1485 Returns the result of converting the single-precision floating-point value
1486 `a' to the 64-bit two's complement integer format.  The conversion is
1487 performed according to the IEC/IEEE Standard for Binary Floating-Point
1488 Arithmetic, except that the conversion is always rounded toward zero.  If
1489 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1490 conversion overflows, the largest integer with the same sign as `a' is
1491 returned.
1492 -------------------------------------------------------------------------------
1493 */
1494 int64 float32_to_int64_round_to_zero( float32 a )
1495 {
1496     flag aSign;
1497     int16 aExp, shiftCount;
1498     bits32 aSig;
1499     bits64 aSig64;
1500     int64 z;
1501 
1502     aSig = extractFloat32Frac( a );
1503     aExp = extractFloat32Exp( a );
1504     aSign = extractFloat32Sign( a );
1505     shiftCount = aExp - 0xBE;
1506     if ( 0 <= shiftCount ) {
1507         if ( a != 0xDF000000 ) {
1508             float_raise( float_flag_invalid );
1509             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1510                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1511             }
1512         }
1513         return (sbits64) LIT64( 0x8000000000000000 );
1514     }
1515     else if ( aExp <= 0x7E ) {
1516         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1517         return 0;
1518     }
1519     aSig64 = aSig | 0x00800000;
1520     aSig64 <<= 40;
1521     z = aSig64>>( - shiftCount );
1522     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1523         float_exception_flags |= float_flag_inexact;
1524     }
1525     if ( aSign ) z = - z;
1526     return z;
1527 
1528 }
1529 #endif /* !SOFTFLOAT_FOR_GCC */
1530 
1531 /*
1532 -------------------------------------------------------------------------------
1533 Returns the result of converting the single-precision floating-point value
1534 `a' to the double-precision floating-point format.  The conversion is
1535 performed according to the IEC/IEEE Standard for Binary Floating-Point
1536 Arithmetic.
1537 -------------------------------------------------------------------------------
1538 */
1539 float64 float32_to_float64( float32 a )
1540 {
1541     flag aSign;
1542     int16 aExp;
1543     bits32 aSig;
1544 
1545     aSig = extractFloat32Frac( a );
1546     aExp = extractFloat32Exp( a );
1547     aSign = extractFloat32Sign( a );
1548     if ( aExp == 0xFF ) {
1549         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1550         return packFloat64( aSign, 0x7FF, 0 );
1551     }
1552     if ( aExp == 0 ) {
1553         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1554         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1555         --aExp;
1556     }
1557     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1558 
1559 }
1560 
1561 #ifdef FLOATX80
1562 
1563 /*
1564 -------------------------------------------------------------------------------
1565 Returns the result of converting the single-precision floating-point value
1566 `a' to the extended double-precision floating-point format.  The conversion
1567 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1568 Arithmetic.
1569 -------------------------------------------------------------------------------
1570 */
1571 floatx80 float32_to_floatx80( float32 a )
1572 {
1573     flag aSign;
1574     int16 aExp;
1575     bits32 aSig;
1576 
1577     aSig = extractFloat32Frac( a );
1578     aExp = extractFloat32Exp( a );
1579     aSign = extractFloat32Sign( a );
1580     if ( aExp == 0xFF ) {
1581         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1582         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1583     }
1584     if ( aExp == 0 ) {
1585         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1586         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1587     }
1588     aSig |= 0x00800000;
1589     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1590 
1591 }
1592 
1593 #endif
1594 
1595 #ifdef FLOAT128
1596 
1597 /*
1598 -------------------------------------------------------------------------------
1599 Returns the result of converting the single-precision floating-point value
1600 `a' to the double-precision floating-point format.  The conversion is
1601 performed according to the IEC/IEEE Standard for Binary Floating-Point
1602 Arithmetic.
1603 -------------------------------------------------------------------------------
1604 */
1605 float128 float32_to_float128( float32 a )
1606 {
1607     flag aSign;
1608     int16 aExp;
1609     bits32 aSig;
1610 
1611     aSig = extractFloat32Frac( a );
1612     aExp = extractFloat32Exp( a );
1613     aSign = extractFloat32Sign( a );
1614     if ( aExp == 0xFF ) {
1615         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1616         return packFloat128( aSign, 0x7FFF, 0, 0 );
1617     }
1618     if ( aExp == 0 ) {
1619         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1620         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1621         --aExp;
1622     }
1623     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1624 
1625 }
1626 
1627 #endif
1628 
1629 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1630 /*
1631 -------------------------------------------------------------------------------
1632 Rounds the single-precision floating-point value `a' to an integer, and
1633 returns the result as a single-precision floating-point value.  The
1634 operation is performed according to the IEC/IEEE Standard for Binary
1635 Floating-Point Arithmetic.
1636 -------------------------------------------------------------------------------
1637 */
1638 float32 float32_round_to_int( float32 a )
1639 {
1640     flag aSign;
1641     int16 aExp;
1642     bits32 lastBitMask, roundBitsMask;
1643     int8 roundingMode;
1644     float32 z;
1645 
1646     aExp = extractFloat32Exp( a );
1647     if ( 0x96 <= aExp ) {
1648         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1649             return propagateFloat32NaN( a, a );
1650         }
1651         return a;
1652     }
1653     if ( aExp <= 0x7E ) {
1654         if ( (bits32) ( a<<1 ) == 0 ) return a;
1655         float_exception_flags |= float_flag_inexact;
1656         aSign = extractFloat32Sign( a );
1657         switch ( float_rounding_mode ) {
1658          case float_round_nearest_even:
1659             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1660                 return packFloat32( aSign, 0x7F, 0 );
1661             }
1662             break;
1663 	 case float_round_to_zero:
1664 	    break;
1665          case float_round_down:
1666             return aSign ? 0xBF800000 : 0;
1667          case float_round_up:
1668             return aSign ? 0x80000000 : 0x3F800000;
1669         }
1670         return packFloat32( aSign, 0, 0 );
1671     }
1672     lastBitMask = 1;
1673     lastBitMask <<= 0x96 - aExp;
1674     roundBitsMask = lastBitMask - 1;
1675     z = a;
1676     roundingMode = float_rounding_mode;
1677     if ( roundingMode == float_round_nearest_even ) {
1678         z += lastBitMask>>1;
1679         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1680     }
1681     else if ( roundingMode != float_round_to_zero ) {
1682         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1683             z += roundBitsMask;
1684         }
1685     }
1686     z &= ~ roundBitsMask;
1687     if ( z != a ) float_exception_flags |= float_flag_inexact;
1688     return z;
1689 
1690 }
1691 #endif /* !SOFTFLOAT_FOR_GCC */
1692 
1693 /*
1694 -------------------------------------------------------------------------------
1695 Returns the result of adding the absolute values of the single-precision
1696 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1697 before being returned.  `zSign' is ignored if the result is a NaN.
1698 The addition is performed according to the IEC/IEEE Standard for Binary
1699 Floating-Point Arithmetic.
1700 -------------------------------------------------------------------------------
1701 */
1702 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1703 {
1704     int16 aExp, bExp, zExp;
1705     bits32 aSig, bSig, zSig;
1706     int16 expDiff;
1707 
1708     aSig = extractFloat32Frac( a );
1709     aExp = extractFloat32Exp( a );
1710     bSig = extractFloat32Frac( b );
1711     bExp = extractFloat32Exp( b );
1712     expDiff = aExp - bExp;
1713     aSig <<= 6;
1714     bSig <<= 6;
1715     if ( 0 < expDiff ) {
1716         if ( aExp == 0xFF ) {
1717             if ( aSig ) return propagateFloat32NaN( a, b );
1718             return a;
1719         }
1720         if ( bExp == 0 ) {
1721             --expDiff;
1722         }
1723         else {
1724             bSig |= 0x20000000;
1725         }
1726         shift32RightJamming( bSig, expDiff, &bSig );
1727         zExp = aExp;
1728     }
1729     else if ( expDiff < 0 ) {
1730         if ( bExp == 0xFF ) {
1731             if ( bSig ) return propagateFloat32NaN( a, b );
1732             return packFloat32( zSign, 0xFF, 0 );
1733         }
1734         if ( aExp == 0 ) {
1735             ++expDiff;
1736         }
1737         else {
1738             aSig |= 0x20000000;
1739         }
1740         shift32RightJamming( aSig, - expDiff, &aSig );
1741         zExp = bExp;
1742     }
1743     else {
1744         if ( aExp == 0xFF ) {
1745             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1746             return a;
1747         }
1748         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1749         zSig = 0x40000000 + aSig + bSig;
1750         zExp = aExp;
1751         goto roundAndPack;
1752     }
1753     aSig |= 0x20000000;
1754     zSig = ( aSig + bSig )<<1;
1755     --zExp;
1756     if ( (sbits32) zSig < 0 ) {
1757         zSig = aSig + bSig;
1758         ++zExp;
1759     }
1760  roundAndPack:
1761     return roundAndPackFloat32( zSign, zExp, zSig );
1762 
1763 }
1764 
1765 /*
1766 -------------------------------------------------------------------------------
1767 Returns the result of subtracting the absolute values of the single-
1768 precision floating-point values `a' and `b'.  If `zSign' is 1, the
1769 difference is negated before being returned.  `zSign' is ignored if the
1770 result is a NaN.  The subtraction is performed according to the IEC/IEEE
1771 Standard for Binary Floating-Point Arithmetic.
1772 -------------------------------------------------------------------------------
1773 */
1774 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1775 {
1776     int16 aExp, bExp, zExp;
1777     bits32 aSig, bSig, zSig;
1778     int16 expDiff;
1779 
1780     aSig = extractFloat32Frac( a );
1781     aExp = extractFloat32Exp( a );
1782     bSig = extractFloat32Frac( b );
1783     bExp = extractFloat32Exp( b );
1784     expDiff = aExp - bExp;
1785     aSig <<= 7;
1786     bSig <<= 7;
1787     if ( 0 < expDiff ) goto aExpBigger;
1788     if ( expDiff < 0 ) goto bExpBigger;
1789     if ( aExp == 0xFF ) {
1790         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1791         float_raise( float_flag_invalid );
1792         return float32_default_nan;
1793     }
1794     if ( aExp == 0 ) {
1795         aExp = 1;
1796         bExp = 1;
1797     }
1798     if ( bSig < aSig ) goto aBigger;
1799     if ( aSig < bSig ) goto bBigger;
1800     return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1801  bExpBigger:
1802     if ( bExp == 0xFF ) {
1803         if ( bSig ) return propagateFloat32NaN( a, b );
1804         return packFloat32( zSign ^ 1, 0xFF, 0 );
1805     }
1806     if ( aExp == 0 ) {
1807         ++expDiff;
1808     }
1809     else {
1810         aSig |= 0x40000000;
1811     }
1812     shift32RightJamming( aSig, - expDiff, &aSig );
1813     bSig |= 0x40000000;
1814  bBigger:
1815     zSig = bSig - aSig;
1816     zExp = bExp;
1817     zSign ^= 1;
1818     goto normalizeRoundAndPack;
1819  aExpBigger:
1820     if ( aExp == 0xFF ) {
1821         if ( aSig ) return propagateFloat32NaN( a, b );
1822         return a;
1823     }
1824     if ( bExp == 0 ) {
1825         --expDiff;
1826     }
1827     else {
1828         bSig |= 0x40000000;
1829     }
1830     shift32RightJamming( bSig, expDiff, &bSig );
1831     aSig |= 0x40000000;
1832  aBigger:
1833     zSig = aSig - bSig;
1834     zExp = aExp;
1835  normalizeRoundAndPack:
1836     --zExp;
1837     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1838 
1839 }
1840 
1841 /*
1842 -------------------------------------------------------------------------------
1843 Returns the result of adding the single-precision floating-point values `a'
1844 and `b'.  The operation is performed according to the IEC/IEEE Standard for
1845 Binary Floating-Point Arithmetic.
1846 -------------------------------------------------------------------------------
1847 */
1848 float32 float32_add( float32 a, float32 b )
1849 {
1850     flag aSign, bSign;
1851 
1852     aSign = extractFloat32Sign( a );
1853     bSign = extractFloat32Sign( b );
1854     if ( aSign == bSign ) {
1855         return addFloat32Sigs( a, b, aSign );
1856     }
1857     else {
1858         return subFloat32Sigs( a, b, aSign );
1859     }
1860 
1861 }
1862 
1863 /*
1864 -------------------------------------------------------------------------------
1865 Returns the result of subtracting the single-precision floating-point values
1866 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1867 for Binary Floating-Point Arithmetic.
1868 -------------------------------------------------------------------------------
1869 */
1870 float32 float32_sub( float32 a, float32 b )
1871 {
1872     flag aSign, bSign;
1873 
1874     aSign = extractFloat32Sign( a );
1875     bSign = extractFloat32Sign( b );
1876     if ( aSign == bSign ) {
1877         return subFloat32Sigs( a, b, aSign );
1878     }
1879     else {
1880         return addFloat32Sigs( a, b, aSign );
1881     }
1882 
1883 }
1884 
1885 /*
1886 -------------------------------------------------------------------------------
1887 Returns the result of multiplying the single-precision floating-point values
1888 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1889 for Binary Floating-Point Arithmetic.
1890 -------------------------------------------------------------------------------
1891 */
1892 float32 float32_mul( float32 a, float32 b )
1893 {
1894     flag aSign, bSign, zSign;
1895     int16 aExp, bExp, zExp;
1896     bits32 aSig, bSig;
1897     bits64 zSig64;
1898     bits32 zSig;
1899 
1900     aSig = extractFloat32Frac( a );
1901     aExp = extractFloat32Exp( a );
1902     aSign = extractFloat32Sign( a );
1903     bSig = extractFloat32Frac( b );
1904     bExp = extractFloat32Exp( b );
1905     bSign = extractFloat32Sign( b );
1906     zSign = aSign ^ bSign;
1907     if ( aExp == 0xFF ) {
1908         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1909             return propagateFloat32NaN( a, b );
1910         }
1911         if ( ( bExp | bSig ) == 0 ) {
1912             float_raise( float_flag_invalid );
1913             return float32_default_nan;
1914         }
1915         return packFloat32( zSign, 0xFF, 0 );
1916     }
1917     if ( bExp == 0xFF ) {
1918         if ( bSig ) return propagateFloat32NaN( a, b );
1919         if ( ( aExp | aSig ) == 0 ) {
1920             float_raise( float_flag_invalid );
1921             return float32_default_nan;
1922         }
1923         return packFloat32( zSign, 0xFF, 0 );
1924     }
1925     if ( aExp == 0 ) {
1926         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1927         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1928     }
1929     if ( bExp == 0 ) {
1930         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1931         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1932     }
1933     zExp = aExp + bExp - 0x7F;
1934     aSig = ( aSig | 0x00800000 )<<7;
1935     bSig = ( bSig | 0x00800000 )<<8;
1936     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1937     zSig = zSig64;
1938     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1939         zSig <<= 1;
1940         --zExp;
1941     }
1942     return roundAndPackFloat32( zSign, zExp, zSig );
1943 
1944 }
1945 
1946 /*
1947 -------------------------------------------------------------------------------
1948 Returns the result of dividing the single-precision floating-point value `a'
1949 by the corresponding value `b'.  The operation is performed according to the
1950 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1951 -------------------------------------------------------------------------------
1952 */
1953 float32 float32_div( float32 a, float32 b )
1954 {
1955     flag aSign, bSign, zSign;
1956     int16 aExp, bExp, zExp;
1957     bits32 aSig, bSig, zSig;
1958 
1959     aSig = extractFloat32Frac( a );
1960     aExp = extractFloat32Exp( a );
1961     aSign = extractFloat32Sign( a );
1962     bSig = extractFloat32Frac( b );
1963     bExp = extractFloat32Exp( b );
1964     bSign = extractFloat32Sign( b );
1965     zSign = aSign ^ bSign;
1966     if ( aExp == 0xFF ) {
1967         if ( aSig ) return propagateFloat32NaN( a, b );
1968         if ( bExp == 0xFF ) {
1969             if ( bSig ) return propagateFloat32NaN( a, b );
1970             float_raise( float_flag_invalid );
1971             return float32_default_nan;
1972         }
1973         return packFloat32( zSign, 0xFF, 0 );
1974     }
1975     if ( bExp == 0xFF ) {
1976         if ( bSig ) return propagateFloat32NaN( a, b );
1977         return packFloat32( zSign, 0, 0 );
1978     }
1979     if ( bExp == 0 ) {
1980         if ( bSig == 0 ) {
1981             if ( ( aExp | aSig ) == 0 ) {
1982                 float_raise( float_flag_invalid );
1983                 return float32_default_nan;
1984             }
1985             float_raise( float_flag_divbyzero );
1986             return packFloat32( zSign, 0xFF, 0 );
1987         }
1988         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1989     }
1990     if ( aExp == 0 ) {
1991         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1992         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1993     }
1994     zExp = aExp - bExp + 0x7D;
1995     aSig = ( aSig | 0x00800000 )<<7;
1996     bSig = ( bSig | 0x00800000 )<<8;
1997     if ( bSig <= ( aSig + aSig ) ) {
1998         aSig >>= 1;
1999         ++zExp;
2000     }
2001     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2002     if ( ( zSig & 0x3F ) == 0 ) {
2003         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2004     }
2005     return roundAndPackFloat32( zSign, zExp, zSig );
2006 
2007 }
2008 
2009 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2010 /*
2011 -------------------------------------------------------------------------------
2012 Returns the remainder of the single-precision floating-point value `a'
2013 with respect to the corresponding value `b'.  The operation is performed
2014 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2015 -------------------------------------------------------------------------------
2016 */
2017 float32 float32_rem( float32 a, float32 b )
2018 {
2019     flag aSign, bSign, zSign;
2020     int16 aExp, bExp, expDiff;
2021     bits32 aSig, bSig;
2022     bits32 q;
2023     bits64 aSig64, bSig64, q64;
2024     bits32 alternateASig;
2025     sbits32 sigMean;
2026 
2027     aSig = extractFloat32Frac( a );
2028     aExp = extractFloat32Exp( a );
2029     aSign = extractFloat32Sign( a );
2030     bSig = extractFloat32Frac( b );
2031     bExp = extractFloat32Exp( b );
2032     bSign = extractFloat32Sign( b );
2033     if ( aExp == 0xFF ) {
2034         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2035             return propagateFloat32NaN( a, b );
2036         }
2037         float_raise( float_flag_invalid );
2038         return float32_default_nan;
2039     }
2040     if ( bExp == 0xFF ) {
2041         if ( bSig ) return propagateFloat32NaN( a, b );
2042         return a;
2043     }
2044     if ( bExp == 0 ) {
2045         if ( bSig == 0 ) {
2046             float_raise( float_flag_invalid );
2047             return float32_default_nan;
2048         }
2049         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2050     }
2051     if ( aExp == 0 ) {
2052         if ( aSig == 0 ) return a;
2053         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2054     }
2055     expDiff = aExp - bExp;
2056     aSig |= 0x00800000;
2057     bSig |= 0x00800000;
2058     if ( expDiff < 32 ) {
2059         aSig <<= 8;
2060         bSig <<= 8;
2061         if ( expDiff < 0 ) {
2062             if ( expDiff < -1 ) return a;
2063             aSig >>= 1;
2064         }
2065         q = ( bSig <= aSig );
2066         if ( q ) aSig -= bSig;
2067         if ( 0 < expDiff ) {
2068             q = ( ( (bits64) aSig )<<32 ) / bSig;
2069             q >>= 32 - expDiff;
2070             bSig >>= 2;
2071             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2072         }
2073         else {
2074             aSig >>= 2;
2075             bSig >>= 2;
2076         }
2077     }
2078     else {
2079         if ( bSig <= aSig ) aSig -= bSig;
2080         aSig64 = ( (bits64) aSig )<<40;
2081         bSig64 = ( (bits64) bSig )<<40;
2082         expDiff -= 64;
2083         while ( 0 < expDiff ) {
2084             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2085             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2086             aSig64 = - ( ( bSig * q64 )<<38 );
2087             expDiff -= 62;
2088         }
2089         expDiff += 64;
2090         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2091         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2092         q = q64>>( 64 - expDiff );
2093         bSig <<= 6;
2094         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2095     }
2096     do {
2097         alternateASig = aSig;
2098         ++q;
2099         aSig -= bSig;
2100     } while ( 0 <= (sbits32) aSig );
2101     sigMean = aSig + alternateASig;
2102     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2103         aSig = alternateASig;
2104     }
2105     zSign = ( (sbits32) aSig < 0 );
2106     if ( zSign ) aSig = - aSig;
2107     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2108 
2109 }
2110 #endif /* !SOFTFLOAT_FOR_GCC */
2111 
2112 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2113 /*
2114 -------------------------------------------------------------------------------
2115 Returns the square root of the single-precision floating-point value `a'.
2116 The operation is performed according to the IEC/IEEE Standard for Binary
2117 Floating-Point Arithmetic.
2118 -------------------------------------------------------------------------------
2119 */
2120 float32 float32_sqrt( float32 a )
2121 {
2122     flag aSign;
2123     int16 aExp, zExp;
2124     bits32 aSig, zSig;
2125     bits64 rem, term;
2126 
2127     aSig = extractFloat32Frac( a );
2128     aExp = extractFloat32Exp( a );
2129     aSign = extractFloat32Sign( a );
2130     if ( aExp == 0xFF ) {
2131         if ( aSig ) return propagateFloat32NaN( a, 0 );
2132         if ( ! aSign ) return a;
2133         float_raise( float_flag_invalid );
2134         return float32_default_nan;
2135     }
2136     if ( aSign ) {
2137         if ( ( aExp | aSig ) == 0 ) return a;
2138         float_raise( float_flag_invalid );
2139         return float32_default_nan;
2140     }
2141     if ( aExp == 0 ) {
2142         if ( aSig == 0 ) return 0;
2143         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2144     }
2145     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2146     aSig = ( aSig | 0x00800000 )<<8;
2147     zSig = estimateSqrt32( aExp, aSig ) + 2;
2148     if ( ( zSig & 0x7F ) <= 5 ) {
2149         if ( zSig < 2 ) {
2150             zSig = 0x7FFFFFFF;
2151             goto roundAndPack;
2152         }
2153         aSig >>= aExp & 1;
2154         term = ( (bits64) zSig ) * zSig;
2155         rem = ( ( (bits64) aSig )<<32 ) - term;
2156         while ( (sbits64) rem < 0 ) {
2157             --zSig;
2158             rem += ( ( (bits64) zSig )<<1 ) | 1;
2159         }
2160         zSig |= ( rem != 0 );
2161     }
2162     shift32RightJamming( zSig, 1, &zSig );
2163  roundAndPack:
2164     return roundAndPackFloat32( 0, zExp, zSig );
2165 
2166 }
2167 #endif /* !SOFTFLOAT_FOR_GCC */
2168 
2169 /*
2170 -------------------------------------------------------------------------------
2171 Returns 1 if the single-precision floating-point value `a' is equal to
2172 the corresponding value `b', and 0 otherwise.  The comparison is performed
2173 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2174 -------------------------------------------------------------------------------
2175 */
2176 flag float32_eq( float32 a, float32 b )
2177 {
2178 
2179     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2180          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2181        ) {
2182         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2183             float_raise( float_flag_invalid );
2184         }
2185         return 0;
2186     }
2187     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2188 
2189 }
2190 
2191 /*
2192 -------------------------------------------------------------------------------
2193 Returns 1 if the single-precision floating-point value `a' is less than
2194 or equal to the corresponding value `b', and 0 otherwise.  The comparison
2195 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2196 Arithmetic.
2197 -------------------------------------------------------------------------------
2198 */
2199 flag float32_le( float32 a, float32 b )
2200 {
2201     flag aSign, bSign;
2202 
2203     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2204          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2205        ) {
2206         float_raise( float_flag_invalid );
2207         return 0;
2208     }
2209     aSign = extractFloat32Sign( a );
2210     bSign = extractFloat32Sign( b );
2211     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2212     return ( a == b ) || ( aSign ^ ( a < b ) );
2213 
2214 }
2215 
2216 /*
2217 -------------------------------------------------------------------------------
2218 Returns 1 if the single-precision floating-point value `a' is less than
2219 the corresponding value `b', and 0 otherwise.  The comparison is performed
2220 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2221 -------------------------------------------------------------------------------
2222 */
2223 flag float32_lt( float32 a, float32 b )
2224 {
2225     flag aSign, bSign;
2226 
2227     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2228          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2229        ) {
2230         float_raise( float_flag_invalid );
2231         return 0;
2232     }
2233     aSign = extractFloat32Sign( a );
2234     bSign = extractFloat32Sign( b );
2235     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2236     return ( a != b ) && ( aSign ^ ( a < b ) );
2237 
2238 }
2239 
2240 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2241 /*
2242 -------------------------------------------------------------------------------
2243 Returns 1 if the single-precision floating-point value `a' is equal to
2244 the corresponding value `b', and 0 otherwise.  The invalid exception is
2245 raised if either operand is a NaN.  Otherwise, the comparison is performed
2246 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2247 -------------------------------------------------------------------------------
2248 */
2249 flag float32_eq_signaling( float32 a, float32 b )
2250 {
2251 
2252     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2253          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2254        ) {
2255         float_raise( float_flag_invalid );
2256         return 0;
2257     }
2258     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2259 
2260 }
2261 
2262 /*
2263 -------------------------------------------------------------------------------
2264 Returns 1 if the single-precision floating-point value `a' is less than or
2265 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2266 cause an exception.  Otherwise, the comparison is performed according to the
2267 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2268 -------------------------------------------------------------------------------
2269 */
2270 flag float32_le_quiet( float32 a, float32 b )
2271 {
2272     flag aSign, bSign;
2273 
2274     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2275          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2276        ) {
2277         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2278             float_raise( float_flag_invalid );
2279         }
2280         return 0;
2281     }
2282     aSign = extractFloat32Sign( a );
2283     bSign = extractFloat32Sign( b );
2284     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2285     return ( a == b ) || ( aSign ^ ( a < b ) );
2286 
2287 }
2288 
2289 /*
2290 -------------------------------------------------------------------------------
2291 Returns 1 if the single-precision floating-point value `a' is less than
2292 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2293 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2294 Standard for Binary Floating-Point Arithmetic.
2295 -------------------------------------------------------------------------------
2296 */
2297 flag float32_lt_quiet( float32 a, float32 b )
2298 {
2299     flag aSign, bSign;
2300 
2301     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2302          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2303        ) {
2304         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2305             float_raise( float_flag_invalid );
2306         }
2307         return 0;
2308     }
2309     aSign = extractFloat32Sign( a );
2310     bSign = extractFloat32Sign( b );
2311     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2312     return ( a != b ) && ( aSign ^ ( a < b ) );
2313 
2314 }
2315 #endif /* !SOFTFLOAT_FOR_GCC */
2316 
2317 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2318 /*
2319 -------------------------------------------------------------------------------
2320 Returns the result of converting the double-precision floating-point value
2321 `a' to the 32-bit two's complement integer format.  The conversion is
2322 performed according to the IEC/IEEE Standard for Binary Floating-Point
2323 Arithmetic---which means in particular that the conversion is rounded
2324 according to the current rounding mode.  If `a' is a NaN, the largest
2325 positive integer is returned.  Otherwise, if the conversion overflows, the
2326 largest integer with the same sign as `a' is returned.
2327 -------------------------------------------------------------------------------
2328 */
2329 int32 float64_to_int32( float64 a )
2330 {
2331     flag aSign;
2332     int16 aExp, shiftCount;
2333     bits64 aSig;
2334 
2335     aSig = extractFloat64Frac( a );
2336     aExp = extractFloat64Exp( a );
2337     aSign = extractFloat64Sign( a );
2338     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2339     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2340     shiftCount = 0x42C - aExp;
2341     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2342     return roundAndPackInt32( aSign, aSig );
2343 
2344 }
2345 #endif /* !SOFTFLOAT_FOR_GCC */
2346 
2347 /*
2348 -------------------------------------------------------------------------------
2349 Returns the result of converting the double-precision floating-point value
2350 `a' to the 32-bit two's complement integer format.  The conversion is
2351 performed according to the IEC/IEEE Standard for Binary Floating-Point
2352 Arithmetic, except that the conversion is always rounded toward zero.
2353 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2354 the conversion overflows, the largest integer with the same sign as `a' is
2355 returned.
2356 -------------------------------------------------------------------------------
2357 */
2358 int32 float64_to_int32_round_to_zero( float64 a )
2359 {
2360     flag aSign;
2361     int16 aExp, shiftCount;
2362     bits64 aSig, savedASig;
2363     int32 z;
2364 
2365     aSig = extractFloat64Frac( a );
2366     aExp = extractFloat64Exp( a );
2367     aSign = extractFloat64Sign( a );
2368     if ( 0x41E < aExp ) {
2369         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2370         goto invalid;
2371     }
2372     else if ( aExp < 0x3FF ) {
2373         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2374         return 0;
2375     }
2376     aSig |= LIT64( 0x0010000000000000 );
2377     shiftCount = 0x433 - aExp;
2378     savedASig = aSig;
2379     aSig >>= shiftCount;
2380     z = aSig;
2381     if ( aSign ) z = - z;
2382     if ( ( z < 0 ) ^ aSign ) {
2383  invalid:
2384         float_raise( float_flag_invalid );
2385         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2386     }
2387     if ( ( aSig<<shiftCount ) != savedASig ) {
2388         float_exception_flags |= float_flag_inexact;
2389     }
2390     return z;
2391 
2392 }
2393 
2394 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2395 /*
2396 -------------------------------------------------------------------------------
2397 Returns the result of converting the double-precision floating-point value
2398 `a' to the 64-bit two's complement integer format.  The conversion is
2399 performed according to the IEC/IEEE Standard for Binary Floating-Point
2400 Arithmetic---which means in particular that the conversion is rounded
2401 according to the current rounding mode.  If `a' is a NaN, the largest
2402 positive integer is returned.  Otherwise, if the conversion overflows, the
2403 largest integer with the same sign as `a' is returned.
2404 -------------------------------------------------------------------------------
2405 */
2406 int64 float64_to_int64( float64 a )
2407 {
2408     flag aSign;
2409     int16 aExp, shiftCount;
2410     bits64 aSig, aSigExtra;
2411 
2412     aSig = extractFloat64Frac( a );
2413     aExp = extractFloat64Exp( a );
2414     aSign = extractFloat64Sign( a );
2415     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2416     shiftCount = 0x433 - aExp;
2417     if ( shiftCount <= 0 ) {
2418         if ( 0x43E < aExp ) {
2419             float_raise( float_flag_invalid );
2420             if (    ! aSign
2421                  || (    ( aExp == 0x7FF )
2422                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2423                ) {
2424                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2425             }
2426             return (sbits64) LIT64( 0x8000000000000000 );
2427         }
2428         aSigExtra = 0;
2429         aSig <<= - shiftCount;
2430     }
2431     else {
2432         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2433     }
2434     return roundAndPackInt64( aSign, aSig, aSigExtra );
2435 
2436 }
2437 
2438 /*
2439 -------------------------------------------------------------------------------
2440 Returns the result of converting the double-precision floating-point value
2441 `a' to the 64-bit two's complement integer format.  The conversion is
2442 performed according to the IEC/IEEE Standard for Binary Floating-Point
2443 Arithmetic, except that the conversion is always rounded toward zero.
2444 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2445 the conversion overflows, the largest integer with the same sign as `a' is
2446 returned.
2447 -------------------------------------------------------------------------------
2448 */
2449 int64 float64_to_int64_round_to_zero( float64 a )
2450 {
2451     flag aSign;
2452     int16 aExp, shiftCount;
2453     bits64 aSig;
2454     int64 z;
2455 
2456     aSig = extractFloat64Frac( a );
2457     aExp = extractFloat64Exp( a );
2458     aSign = extractFloat64Sign( a );
2459     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2460     shiftCount = aExp - 0x433;
2461     if ( 0 <= shiftCount ) {
2462         if ( 0x43E <= aExp ) {
2463             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2464                 float_raise( float_flag_invalid );
2465                 if (    ! aSign
2466                      || (    ( aExp == 0x7FF )
2467                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2468                    ) {
2469                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2470                 }
2471             }
2472             return (sbits64) LIT64( 0x8000000000000000 );
2473         }
2474         z = aSig<<shiftCount;
2475     }
2476     else {
2477         if ( aExp < 0x3FE ) {
2478             if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2479             return 0;
2480         }
2481         z = aSig>>( - shiftCount );
2482         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2483             float_exception_flags |= float_flag_inexact;
2484         }
2485     }
2486     if ( aSign ) z = - z;
2487     return z;
2488 
2489 }
2490 #endif /* !SOFTFLOAT_FOR_GCC */
2491 
2492 /*
2493 -------------------------------------------------------------------------------
2494 Returns the result of converting the double-precision floating-point value
2495 `a' to the single-precision floating-point format.  The conversion is
2496 performed according to the IEC/IEEE Standard for Binary Floating-Point
2497 Arithmetic.
2498 -------------------------------------------------------------------------------
2499 */
2500 float32 float64_to_float32( float64 a )
2501 {
2502     flag aSign;
2503     int16 aExp;
2504     bits64 aSig;
2505     bits32 zSig;
2506 
2507     aSig = extractFloat64Frac( a );
2508     aExp = extractFloat64Exp( a );
2509     aSign = extractFloat64Sign( a );
2510     if ( aExp == 0x7FF ) {
2511         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2512         return packFloat32( aSign, 0xFF, 0 );
2513     }
2514     shift64RightJamming( aSig, 22, &aSig );
2515     zSig = aSig;
2516     if ( aExp || zSig ) {
2517         zSig |= 0x40000000;
2518         aExp -= 0x381;
2519     }
2520     return roundAndPackFloat32( aSign, aExp, zSig );
2521 
2522 }
2523 
2524 #ifdef FLOATX80
2525 
2526 /*
2527 -------------------------------------------------------------------------------
2528 Returns the result of converting the double-precision floating-point value
2529 `a' to the extended double-precision floating-point format.  The conversion
2530 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2531 Arithmetic.
2532 -------------------------------------------------------------------------------
2533 */
2534 floatx80 float64_to_floatx80( float64 a )
2535 {
2536     flag aSign;
2537     int16 aExp;
2538     bits64 aSig;
2539 
2540     aSig = extractFloat64Frac( a );
2541     aExp = extractFloat64Exp( a );
2542     aSign = extractFloat64Sign( a );
2543     if ( aExp == 0x7FF ) {
2544         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2545         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2546     }
2547     if ( aExp == 0 ) {
2548         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2549         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2550     }
2551     return
2552         packFloatx80(
2553             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2554 
2555 }
2556 
2557 #endif
2558 
2559 #ifdef FLOAT128
2560 
2561 /*
2562 -------------------------------------------------------------------------------
2563 Returns the result of converting the double-precision floating-point value
2564 `a' to the quadruple-precision floating-point format.  The conversion is
2565 performed according to the IEC/IEEE Standard for Binary Floating-Point
2566 Arithmetic.
2567 -------------------------------------------------------------------------------
2568 */
2569 float128 float64_to_float128( float64 a )
2570 {
2571     flag aSign;
2572     int16 aExp;
2573     bits64 aSig, zSig0, zSig1;
2574 
2575     aSig = extractFloat64Frac( a );
2576     aExp = extractFloat64Exp( a );
2577     aSign = extractFloat64Sign( a );
2578     if ( aExp == 0x7FF ) {
2579         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2580         return packFloat128( aSign, 0x7FFF, 0, 0 );
2581     }
2582     if ( aExp == 0 ) {
2583         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2584         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2585         --aExp;
2586     }
2587     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2588     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2589 
2590 }
2591 
2592 #endif
2593 
2594 #ifndef SOFTFLOAT_FOR_GCC
2595 /*
2596 -------------------------------------------------------------------------------
2597 Rounds the double-precision floating-point value `a' to an integer, and
2598 returns the result as a double-precision floating-point value.  The
2599 operation is performed according to the IEC/IEEE Standard for Binary
2600 Floating-Point Arithmetic.
2601 -------------------------------------------------------------------------------
2602 */
2603 float64 float64_round_to_int( float64 a )
2604 {
2605     flag aSign;
2606     int16 aExp;
2607     bits64 lastBitMask, roundBitsMask;
2608     int8 roundingMode;
2609     float64 z;
2610 
2611     aExp = extractFloat64Exp( a );
2612     if ( 0x433 <= aExp ) {
2613         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2614             return propagateFloat64NaN( a, a );
2615         }
2616         return a;
2617     }
2618     if ( aExp < 0x3FF ) {
2619         if ( (bits64) ( a<<1 ) == 0 ) return a;
2620         float_exception_flags |= float_flag_inexact;
2621         aSign = extractFloat64Sign( a );
2622         switch ( float_rounding_mode ) {
2623          case float_round_nearest_even:
2624             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2625                 return packFloat64( aSign, 0x3FF, 0 );
2626             }
2627             break;
2628 	 case float_round_to_zero:
2629 	    break;
2630          case float_round_down:
2631             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2632          case float_round_up:
2633             return
2634             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2635         }
2636         return packFloat64( aSign, 0, 0 );
2637     }
2638     lastBitMask = 1;
2639     lastBitMask <<= 0x433 - aExp;
2640     roundBitsMask = lastBitMask - 1;
2641     z = a;
2642     roundingMode = float_rounding_mode;
2643     if ( roundingMode == float_round_nearest_even ) {
2644         z += lastBitMask>>1;
2645         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2646     }
2647     else if ( roundingMode != float_round_to_zero ) {
2648         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2649             z += roundBitsMask;
2650         }
2651     }
2652     z &= ~ roundBitsMask;
2653     if ( z != a ) float_exception_flags |= float_flag_inexact;
2654     return z;
2655 
2656 }
2657 #endif
2658 
2659 /*
2660 -------------------------------------------------------------------------------
2661 Returns the result of adding the absolute values of the double-precision
2662 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2663 before being returned.  `zSign' is ignored if the result is a NaN.
2664 The addition is performed according to the IEC/IEEE Standard for Binary
2665 Floating-Point Arithmetic.
2666 -------------------------------------------------------------------------------
2667 */
2668 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2669 {
2670     int16 aExp, bExp, zExp;
2671     bits64 aSig, bSig, zSig;
2672     int16 expDiff;
2673 
2674     aSig = extractFloat64Frac( a );
2675     aExp = extractFloat64Exp( a );
2676     bSig = extractFloat64Frac( b );
2677     bExp = extractFloat64Exp( b );
2678     expDiff = aExp - bExp;
2679     aSig <<= 9;
2680     bSig <<= 9;
2681     if ( 0 < expDiff ) {
2682         if ( aExp == 0x7FF ) {
2683             if ( aSig ) return propagateFloat64NaN( a, b );
2684             return a;
2685         }
2686         if ( bExp == 0 ) {
2687             --expDiff;
2688         }
2689         else {
2690             bSig |= LIT64( 0x2000000000000000 );
2691         }
2692         shift64RightJamming( bSig, expDiff, &bSig );
2693         zExp = aExp;
2694     }
2695     else if ( expDiff < 0 ) {
2696         if ( bExp == 0x7FF ) {
2697             if ( bSig ) return propagateFloat64NaN( a, b );
2698             return packFloat64( zSign, 0x7FF, 0 );
2699         }
2700         if ( aExp == 0 ) {
2701             ++expDiff;
2702         }
2703         else {
2704             aSig |= LIT64( 0x2000000000000000 );
2705         }
2706         shift64RightJamming( aSig, - expDiff, &aSig );
2707         zExp = bExp;
2708     }
2709     else {
2710         if ( aExp == 0x7FF ) {
2711             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2712             return a;
2713         }
2714         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2715         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2716         zExp = aExp;
2717         goto roundAndPack;
2718     }
2719     aSig |= LIT64( 0x2000000000000000 );
2720     zSig = ( aSig + bSig )<<1;
2721     --zExp;
2722     if ( (sbits64) zSig < 0 ) {
2723         zSig = aSig + bSig;
2724         ++zExp;
2725     }
2726  roundAndPack:
2727     return roundAndPackFloat64( zSign, zExp, zSig );
2728 
2729 }
2730 
2731 /*
2732 -------------------------------------------------------------------------------
2733 Returns the result of subtracting the absolute values of the double-
2734 precision floating-point values `a' and `b'.  If `zSign' is 1, the
2735 difference is negated before being returned.  `zSign' is ignored if the
2736 result is a NaN.  The subtraction is performed according to the IEC/IEEE
2737 Standard for Binary Floating-Point Arithmetic.
2738 -------------------------------------------------------------------------------
2739 */
2740 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2741 {
2742     int16 aExp, bExp, zExp;
2743     bits64 aSig, bSig, zSig;
2744     int16 expDiff;
2745 
2746     aSig = extractFloat64Frac( a );
2747     aExp = extractFloat64Exp( a );
2748     bSig = extractFloat64Frac( b );
2749     bExp = extractFloat64Exp( b );
2750     expDiff = aExp - bExp;
2751     aSig <<= 10;
2752     bSig <<= 10;
2753     if ( 0 < expDiff ) goto aExpBigger;
2754     if ( expDiff < 0 ) goto bExpBigger;
2755     if ( aExp == 0x7FF ) {
2756         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2757         float_raise( float_flag_invalid );
2758         return float64_default_nan;
2759     }
2760     if ( aExp == 0 ) {
2761         aExp = 1;
2762         bExp = 1;
2763     }
2764     if ( bSig < aSig ) goto aBigger;
2765     if ( aSig < bSig ) goto bBigger;
2766     return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2767  bExpBigger:
2768     if ( bExp == 0x7FF ) {
2769         if ( bSig ) return propagateFloat64NaN( a, b );
2770         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2771     }
2772     if ( aExp == 0 ) {
2773         ++expDiff;
2774     }
2775     else {
2776         aSig |= LIT64( 0x4000000000000000 );
2777     }
2778     shift64RightJamming( aSig, - expDiff, &aSig );
2779     bSig |= LIT64( 0x4000000000000000 );
2780  bBigger:
2781     zSig = bSig - aSig;
2782     zExp = bExp;
2783     zSign ^= 1;
2784     goto normalizeRoundAndPack;
2785  aExpBigger:
2786     if ( aExp == 0x7FF ) {
2787         if ( aSig ) return propagateFloat64NaN( a, b );
2788         return a;
2789     }
2790     if ( bExp == 0 ) {
2791         --expDiff;
2792     }
2793     else {
2794         bSig |= LIT64( 0x4000000000000000 );
2795     }
2796     shift64RightJamming( bSig, expDiff, &bSig );
2797     aSig |= LIT64( 0x4000000000000000 );
2798  aBigger:
2799     zSig = aSig - bSig;
2800     zExp = aExp;
2801  normalizeRoundAndPack:
2802     --zExp;
2803     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2804 
2805 }
2806 
2807 /*
2808 -------------------------------------------------------------------------------
2809 Returns the result of adding the double-precision floating-point values `a'
2810 and `b'.  The operation is performed according to the IEC/IEEE Standard for
2811 Binary Floating-Point Arithmetic.
2812 -------------------------------------------------------------------------------
2813 */
2814 float64 float64_add( float64 a, float64 b )
2815 {
2816     flag aSign, bSign;
2817 
2818     aSign = extractFloat64Sign( a );
2819     bSign = extractFloat64Sign( b );
2820     if ( aSign == bSign ) {
2821         return addFloat64Sigs( a, b, aSign );
2822     }
2823     else {
2824         return subFloat64Sigs( a, b, aSign );
2825     }
2826 
2827 }
2828 
2829 /*
2830 -------------------------------------------------------------------------------
2831 Returns the result of subtracting the double-precision floating-point values
2832 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2833 for Binary Floating-Point Arithmetic.
2834 -------------------------------------------------------------------------------
2835 */
2836 float64 float64_sub( float64 a, float64 b )
2837 {
2838     flag aSign, bSign;
2839 
2840     aSign = extractFloat64Sign( a );
2841     bSign = extractFloat64Sign( b );
2842     if ( aSign == bSign ) {
2843         return subFloat64Sigs( a, b, aSign );
2844     }
2845     else {
2846         return addFloat64Sigs( a, b, aSign );
2847     }
2848 
2849 }
2850 
2851 /*
2852 -------------------------------------------------------------------------------
2853 Returns the result of multiplying the double-precision floating-point values
2854 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2855 for Binary Floating-Point Arithmetic.
2856 -------------------------------------------------------------------------------
2857 */
2858 float64 float64_mul( float64 a, float64 b )
2859 {
2860     flag aSign, bSign, zSign;
2861     int16 aExp, bExp, zExp;
2862     bits64 aSig, bSig, zSig0, zSig1;
2863 
2864     aSig = extractFloat64Frac( a );
2865     aExp = extractFloat64Exp( a );
2866     aSign = extractFloat64Sign( a );
2867     bSig = extractFloat64Frac( b );
2868     bExp = extractFloat64Exp( b );
2869     bSign = extractFloat64Sign( b );
2870     zSign = aSign ^ bSign;
2871     if ( aExp == 0x7FF ) {
2872         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2873             return propagateFloat64NaN( a, b );
2874         }
2875         if ( ( bExp | bSig ) == 0 ) {
2876             float_raise( float_flag_invalid );
2877             return float64_default_nan;
2878         }
2879         return packFloat64( zSign, 0x7FF, 0 );
2880     }
2881     if ( bExp == 0x7FF ) {
2882         if ( bSig ) return propagateFloat64NaN( a, b );
2883         if ( ( aExp | aSig ) == 0 ) {
2884             float_raise( float_flag_invalid );
2885             return float64_default_nan;
2886         }
2887         return packFloat64( zSign, 0x7FF, 0 );
2888     }
2889     if ( aExp == 0 ) {
2890         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2891         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2892     }
2893     if ( bExp == 0 ) {
2894         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2895         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2896     }
2897     zExp = aExp + bExp - 0x3FF;
2898     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2899     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2900     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2901     zSig0 |= ( zSig1 != 0 );
2902     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2903         zSig0 <<= 1;
2904         --zExp;
2905     }
2906     return roundAndPackFloat64( zSign, zExp, zSig0 );
2907 
2908 }
2909 
2910 /*
2911 -------------------------------------------------------------------------------
2912 Returns the result of dividing the double-precision floating-point value `a'
2913 by the corresponding value `b'.  The operation is performed according to
2914 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2915 -------------------------------------------------------------------------------
2916 */
2917 float64 float64_div( float64 a, float64 b )
2918 {
2919     flag aSign, bSign, zSign;
2920     int16 aExp, bExp, zExp;
2921     bits64 aSig, bSig, zSig;
2922     bits64 rem0, rem1;
2923     bits64 term0, term1;
2924 
2925     aSig = extractFloat64Frac( a );
2926     aExp = extractFloat64Exp( a );
2927     aSign = extractFloat64Sign( a );
2928     bSig = extractFloat64Frac( b );
2929     bExp = extractFloat64Exp( b );
2930     bSign = extractFloat64Sign( b );
2931     zSign = aSign ^ bSign;
2932     if ( aExp == 0x7FF ) {
2933         if ( aSig ) return propagateFloat64NaN( a, b );
2934         if ( bExp == 0x7FF ) {
2935             if ( bSig ) return propagateFloat64NaN( a, b );
2936             float_raise( float_flag_invalid );
2937             return float64_default_nan;
2938         }
2939         return packFloat64( zSign, 0x7FF, 0 );
2940     }
2941     if ( bExp == 0x7FF ) {
2942         if ( bSig ) return propagateFloat64NaN( a, b );
2943         return packFloat64( zSign, 0, 0 );
2944     }
2945     if ( bExp == 0 ) {
2946         if ( bSig == 0 ) {
2947             if ( ( aExp | aSig ) == 0 ) {
2948                 float_raise( float_flag_invalid );
2949                 return float64_default_nan;
2950             }
2951             float_raise( float_flag_divbyzero );
2952             return packFloat64( zSign, 0x7FF, 0 );
2953         }
2954         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2955     }
2956     if ( aExp == 0 ) {
2957         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2958         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2959     }
2960     zExp = aExp - bExp + 0x3FD;
2961     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2962     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2963     if ( bSig <= ( aSig + aSig ) ) {
2964         aSig >>= 1;
2965         ++zExp;
2966     }
2967     zSig = estimateDiv128To64( aSig, 0, bSig );
2968     if ( ( zSig & 0x1FF ) <= 2 ) {
2969         mul64To128( bSig, zSig, &term0, &term1 );
2970         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2971         while ( (sbits64) rem0 < 0 ) {
2972             --zSig;
2973             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2974         }
2975         zSig |= ( rem1 != 0 );
2976     }
2977     return roundAndPackFloat64( zSign, zExp, zSig );
2978 
2979 }
2980 
2981 #ifndef SOFTFLOAT_FOR_GCC
2982 /*
2983 -------------------------------------------------------------------------------
2984 Returns the remainder of the double-precision floating-point value `a'
2985 with respect to the corresponding value `b'.  The operation is performed
2986 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2987 -------------------------------------------------------------------------------
2988 */
2989 float64 float64_rem( float64 a, float64 b )
2990 {
2991     flag aSign, bSign, zSign;
2992     int16 aExp, bExp, expDiff;
2993     bits64 aSig, bSig;
2994     bits64 q, alternateASig;
2995     sbits64 sigMean;
2996 
2997     aSig = extractFloat64Frac( a );
2998     aExp = extractFloat64Exp( a );
2999     aSign = extractFloat64Sign( a );
3000     bSig = extractFloat64Frac( b );
3001     bExp = extractFloat64Exp( b );
3002     bSign = extractFloat64Sign( b );
3003     if ( aExp == 0x7FF ) {
3004         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3005             return propagateFloat64NaN( a, b );
3006         }
3007         float_raise( float_flag_invalid );
3008         return float64_default_nan;
3009     }
3010     if ( bExp == 0x7FF ) {
3011         if ( bSig ) return propagateFloat64NaN( a, b );
3012         return a;
3013     }
3014     if ( bExp == 0 ) {
3015         if ( bSig == 0 ) {
3016             float_raise( float_flag_invalid );
3017             return float64_default_nan;
3018         }
3019         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3020     }
3021     if ( aExp == 0 ) {
3022         if ( aSig == 0 ) return a;
3023         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3024     }
3025     expDiff = aExp - bExp;
3026     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3027     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3028     if ( expDiff < 0 ) {
3029         if ( expDiff < -1 ) return a;
3030         aSig >>= 1;
3031     }
3032     q = ( bSig <= aSig );
3033     if ( q ) aSig -= bSig;
3034     expDiff -= 64;
3035     while ( 0 < expDiff ) {
3036         q = estimateDiv128To64( aSig, 0, bSig );
3037         q = ( 2 < q ) ? q - 2 : 0;
3038         aSig = - ( ( bSig>>2 ) * q );
3039         expDiff -= 62;
3040     }
3041     expDiff += 64;
3042     if ( 0 < expDiff ) {
3043         q = estimateDiv128To64( aSig, 0, bSig );
3044         q = ( 2 < q ) ? q - 2 : 0;
3045         q >>= 64 - expDiff;
3046         bSig >>= 2;
3047         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3048     }
3049     else {
3050         aSig >>= 2;
3051         bSig >>= 2;
3052     }
3053     do {
3054         alternateASig = aSig;
3055         ++q;
3056         aSig -= bSig;
3057     } while ( 0 <= (sbits64) aSig );
3058     sigMean = aSig + alternateASig;
3059     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3060         aSig = alternateASig;
3061     }
3062     zSign = ( (sbits64) aSig < 0 );
3063     if ( zSign ) aSig = - aSig;
3064     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3065 
3066 }
3067 
3068 /*
3069 -------------------------------------------------------------------------------
3070 Returns the square root of the double-precision floating-point value `a'.
3071 The operation is performed according to the IEC/IEEE Standard for Binary
3072 Floating-Point Arithmetic.
3073 -------------------------------------------------------------------------------
3074 */
3075 float64 float64_sqrt( float64 a )
3076 {
3077     flag aSign;
3078     int16 aExp, zExp;
3079     bits64 aSig, zSig, doubleZSig;
3080     bits64 rem0, rem1, term0, term1;
3081 
3082     aSig = extractFloat64Frac( a );
3083     aExp = extractFloat64Exp( a );
3084     aSign = extractFloat64Sign( a );
3085     if ( aExp == 0x7FF ) {
3086         if ( aSig ) return propagateFloat64NaN( a, a );
3087         if ( ! aSign ) return a;
3088         float_raise( float_flag_invalid );
3089         return float64_default_nan;
3090     }
3091     if ( aSign ) {
3092         if ( ( aExp | aSig ) == 0 ) return a;
3093         float_raise( float_flag_invalid );
3094         return float64_default_nan;
3095     }
3096     if ( aExp == 0 ) {
3097         if ( aSig == 0 ) return 0;
3098         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3099     }
3100     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3101     aSig |= LIT64( 0x0010000000000000 );
3102     zSig = estimateSqrt32( aExp, aSig>>21 );
3103     aSig <<= 9 - ( aExp & 1 );
3104     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3105     if ( ( zSig & 0x1FF ) <= 5 ) {
3106         doubleZSig = zSig<<1;
3107         mul64To128( zSig, zSig, &term0, &term1 );
3108         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3109         while ( (sbits64) rem0 < 0 ) {
3110             --zSig;
3111             doubleZSig -= 2;
3112             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3113         }
3114         zSig |= ( ( rem0 | rem1 ) != 0 );
3115     }
3116     return roundAndPackFloat64( 0, zExp, zSig );
3117 
3118 }
3119 #endif
3120 
3121 /*
3122 -------------------------------------------------------------------------------
3123 Returns 1 if the double-precision floating-point value `a' is equal to the
3124 corresponding value `b', and 0 otherwise.  The comparison is performed
3125 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3126 -------------------------------------------------------------------------------
3127 */
3128 flag float64_eq( float64 a, float64 b )
3129 {
3130 
3131     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3132          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3133        ) {
3134         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3135             float_raise( float_flag_invalid );
3136         }
3137         return 0;
3138     }
3139     return ( a == b ) ||
3140 	( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3141 
3142 }
3143 
3144 /*
3145 -------------------------------------------------------------------------------
3146 Returns 1 if the double-precision floating-point value `a' is less than or
3147 equal to the corresponding value `b', and 0 otherwise.  The comparison is
3148 performed according to the IEC/IEEE Standard for Binary Floating-Point
3149 Arithmetic.
3150 -------------------------------------------------------------------------------
3151 */
3152 flag float64_le( float64 a, float64 b )
3153 {
3154     flag aSign, bSign;
3155 
3156     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3157          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3158        ) {
3159         float_raise( float_flag_invalid );
3160         return 0;
3161     }
3162     aSign = extractFloat64Sign( a );
3163     bSign = extractFloat64Sign( b );
3164     if ( aSign != bSign )
3165 	return aSign ||
3166 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3167 	      0 );
3168     return ( a == b ) ||
3169 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3170 
3171 }
3172 
3173 /*
3174 -------------------------------------------------------------------------------
3175 Returns 1 if the double-precision floating-point value `a' is less than
3176 the corresponding value `b', and 0 otherwise.  The comparison is performed
3177 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3178 -------------------------------------------------------------------------------
3179 */
3180 flag float64_lt( float64 a, float64 b )
3181 {
3182     flag aSign, bSign;
3183 
3184     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3185          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3186        ) {
3187         float_raise( float_flag_invalid );
3188         return 0;
3189     }
3190     aSign = extractFloat64Sign( a );
3191     bSign = extractFloat64Sign( b );
3192     if ( aSign != bSign )
3193 	return aSign &&
3194 	    ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3195 	      0 );
3196     return ( a != b ) &&
3197 	( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3198 
3199 }
3200 
3201 #ifndef SOFTFLOAT_FOR_GCC
3202 /*
3203 -------------------------------------------------------------------------------
3204 Returns 1 if the double-precision floating-point value `a' is equal to the
3205 corresponding value `b', and 0 otherwise.  The invalid exception is raised
3206 if either operand is a NaN.  Otherwise, the comparison is performed
3207 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3208 -------------------------------------------------------------------------------
3209 */
3210 flag float64_eq_signaling( float64 a, float64 b )
3211 {
3212 
3213     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3214          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3215        ) {
3216         float_raise( float_flag_invalid );
3217         return 0;
3218     }
3219     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3220 
3221 }
3222 
3223 /*
3224 -------------------------------------------------------------------------------
3225 Returns 1 if the double-precision floating-point value `a' is less than or
3226 equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3227 cause an exception.  Otherwise, the comparison is performed according to the
3228 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3229 -------------------------------------------------------------------------------
3230 */
3231 flag float64_le_quiet( float64 a, float64 b )
3232 {
3233     flag aSign, bSign;
3234 
3235     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3236          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3237        ) {
3238         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3239             float_raise( float_flag_invalid );
3240         }
3241         return 0;
3242     }
3243     aSign = extractFloat64Sign( a );
3244     bSign = extractFloat64Sign( b );
3245     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3246     return ( a == b ) || ( aSign ^ ( a < b ) );
3247 
3248 }
3249 
3250 /*
3251 -------------------------------------------------------------------------------
3252 Returns 1 if the double-precision floating-point value `a' is less than
3253 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3254 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3255 Standard for Binary Floating-Point Arithmetic.
3256 -------------------------------------------------------------------------------
3257 */
3258 flag float64_lt_quiet( float64 a, float64 b )
3259 {
3260     flag aSign, bSign;
3261 
3262     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3263          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3264        ) {
3265         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3266             float_raise( float_flag_invalid );
3267         }
3268         return 0;
3269     }
3270     aSign = extractFloat64Sign( a );
3271     bSign = extractFloat64Sign( b );
3272     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3273     return ( a != b ) && ( aSign ^ ( a < b ) );
3274 
3275 }
3276 #endif
3277 
3278 #ifdef FLOATX80
3279 
3280 /*
3281 -------------------------------------------------------------------------------
3282 Returns the result of converting the extended double-precision floating-
3283 point value `a' to the 32-bit two's complement integer format.  The
3284 conversion is performed according to the IEC/IEEE Standard for Binary
3285 Floating-Point Arithmetic---which means in particular that the conversion
3286 is rounded according to the current rounding mode.  If `a' is a NaN, the
3287 largest positive integer is returned.  Otherwise, if the conversion
3288 overflows, the largest integer with the same sign as `a' is returned.
3289 -------------------------------------------------------------------------------
3290 */
3291 int32 floatx80_to_int32( floatx80 a )
3292 {
3293     flag aSign;
3294     int32 aExp, shiftCount;
3295     bits64 aSig;
3296 
3297     aSig = extractFloatx80Frac( a );
3298     aExp = extractFloatx80Exp( a );
3299     aSign = extractFloatx80Sign( a );
3300     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3301     shiftCount = 0x4037 - aExp;
3302     if ( shiftCount <= 0 ) shiftCount = 1;
3303     shift64RightJamming( aSig, shiftCount, &aSig );
3304     return roundAndPackInt32( aSign, aSig );
3305 
3306 }
3307 
3308 /*
3309 -------------------------------------------------------------------------------
3310 Returns the result of converting the extended double-precision floating-
3311 point value `a' to the 32-bit two's complement integer format.  The
3312 conversion is performed according to the IEC/IEEE Standard for Binary
3313 Floating-Point Arithmetic, except that the conversion is always rounded
3314 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3315 Otherwise, if the conversion overflows, the largest integer with the same
3316 sign as `a' is returned.
3317 -------------------------------------------------------------------------------
3318 */
3319 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3320 {
3321     flag aSign;
3322     int32 aExp, shiftCount;
3323     bits64 aSig, savedASig;
3324     int32 z;
3325 
3326     aSig = extractFloatx80Frac( a );
3327     aExp = extractFloatx80Exp( a );
3328     aSign = extractFloatx80Sign( a );
3329     if ( 0x401E < aExp ) {
3330         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3331         goto invalid;
3332     }
3333     else if ( aExp < 0x3FFF ) {
3334         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3335         return 0;
3336     }
3337     shiftCount = 0x403E - aExp;
3338     savedASig = aSig;
3339     aSig >>= shiftCount;
3340     z = aSig;
3341     if ( aSign ) z = - z;
3342     if ( ( z < 0 ) ^ aSign ) {
3343  invalid:
3344         float_raise( float_flag_invalid );
3345         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3346     }
3347     if ( ( aSig<<shiftCount ) != savedASig ) {
3348         float_exception_flags |= float_flag_inexact;
3349     }
3350     return z;
3351 
3352 }
3353 
3354 /*
3355 -------------------------------------------------------------------------------
3356 Returns the result of converting the extended double-precision floating-
3357 point value `a' to the 64-bit two's complement integer format.  The
3358 conversion is performed according to the IEC/IEEE Standard for Binary
3359 Floating-Point Arithmetic---which means in particular that the conversion
3360 is rounded according to the current rounding mode.  If `a' is a NaN,
3361 the largest positive integer is returned.  Otherwise, if the conversion
3362 overflows, the largest integer with the same sign as `a' is returned.
3363 -------------------------------------------------------------------------------
3364 */
3365 int64 floatx80_to_int64( floatx80 a )
3366 {
3367     flag aSign;
3368     int32 aExp, shiftCount;
3369     bits64 aSig, aSigExtra;
3370 
3371     aSig = extractFloatx80Frac( a );
3372     aExp = extractFloatx80Exp( a );
3373     aSign = extractFloatx80Sign( a );
3374     shiftCount = 0x403E - aExp;
3375     if ( shiftCount <= 0 ) {
3376         if ( shiftCount ) {
3377             float_raise( float_flag_invalid );
3378             if (    ! aSign
3379                  || (    ( aExp == 0x7FFF )
3380                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3381                ) {
3382                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3383             }
3384             return (sbits64) LIT64( 0x8000000000000000 );
3385         }
3386         aSigExtra = 0;
3387     }
3388     else {
3389         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3390     }
3391     return roundAndPackInt64( aSign, aSig, aSigExtra );
3392 
3393 }
3394 
3395 /*
3396 -------------------------------------------------------------------------------
3397 Returns the result of converting the extended double-precision floating-
3398 point value `a' to the 64-bit two's complement integer format.  The
3399 conversion is performed according to the IEC/IEEE Standard for Binary
3400 Floating-Point Arithmetic, except that the conversion is always rounded
3401 toward zero.  If `a' is a NaN, the largest positive integer is returned.
3402 Otherwise, if the conversion overflows, the largest integer with the same
3403 sign as `a' is returned.
3404 -------------------------------------------------------------------------------
3405 */
3406 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3407 {
3408     flag aSign;
3409     int32 aExp, shiftCount;
3410     bits64 aSig;
3411     int64 z;
3412 
3413     aSig = extractFloatx80Frac( a );
3414     aExp = extractFloatx80Exp( a );
3415     aSign = extractFloatx80Sign( a );
3416     shiftCount = aExp - 0x403E;
3417     if ( 0 <= shiftCount ) {
3418         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3419         if ( ( a.high != 0xC03E ) || aSig ) {
3420             float_raise( float_flag_invalid );
3421             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3422                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3423             }
3424         }
3425         return (sbits64) LIT64( 0x8000000000000000 );
3426     }
3427     else if ( aExp < 0x3FFF ) {
3428         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3429         return 0;
3430     }
3431     z = aSig>>( - shiftCount );
3432     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3433         float_exception_flags |= float_flag_inexact;
3434     }
3435     if ( aSign ) z = - z;
3436     return z;
3437 
3438 }
3439 
3440 /*
3441 -------------------------------------------------------------------------------
3442 Returns the result of converting the extended double-precision floating-
3443 point value `a' to the single-precision floating-point format.  The
3444 conversion is performed according to the IEC/IEEE Standard for Binary
3445 Floating-Point Arithmetic.
3446 -------------------------------------------------------------------------------
3447 */
3448 float32 floatx80_to_float32( floatx80 a )
3449 {
3450     flag aSign;
3451     int32 aExp;
3452     bits64 aSig;
3453 
3454     aSig = extractFloatx80Frac( a );
3455     aExp = extractFloatx80Exp( a );
3456     aSign = extractFloatx80Sign( a );
3457     if ( aExp == 0x7FFF ) {
3458         if ( (bits64) ( aSig<<1 ) ) {
3459             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3460         }
3461         return packFloat32( aSign, 0xFF, 0 );
3462     }
3463     shift64RightJamming( aSig, 33, &aSig );
3464     if ( aExp || aSig ) aExp -= 0x3F81;
3465     return roundAndPackFloat32( aSign, aExp, aSig );
3466 
3467 }
3468 
3469 /*
3470 -------------------------------------------------------------------------------
3471 Returns the result of converting the extended double-precision floating-
3472 point value `a' to the double-precision floating-point format.  The
3473 conversion is performed according to the IEC/IEEE Standard for Binary
3474 Floating-Point Arithmetic.
3475 -------------------------------------------------------------------------------
3476 */
3477 float64 floatx80_to_float64( floatx80 a )
3478 {
3479     flag aSign;
3480     int32 aExp;
3481     bits64 aSig, zSig;
3482 
3483     aSig = extractFloatx80Frac( a );
3484     aExp = extractFloatx80Exp( a );
3485     aSign = extractFloatx80Sign( a );
3486     if ( aExp == 0x7FFF ) {
3487         if ( (bits64) ( aSig<<1 ) ) {
3488             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3489         }
3490         return packFloat64( aSign, 0x7FF, 0 );
3491     }
3492     shift64RightJamming( aSig, 1, &zSig );
3493     if ( aExp || aSig ) aExp -= 0x3C01;
3494     return roundAndPackFloat64( aSign, aExp, zSig );
3495 
3496 }
3497 
3498 #ifdef FLOAT128
3499 
3500 /*
3501 -------------------------------------------------------------------------------
3502 Returns the result of converting the extended double-precision floating-
3503 point value `a' to the quadruple-precision floating-point format.  The
3504 conversion is performed according to the IEC/IEEE Standard for Binary
3505 Floating-Point Arithmetic.
3506 -------------------------------------------------------------------------------
3507 */
3508 float128 floatx80_to_float128( floatx80 a )
3509 {
3510     flag aSign;
3511     int16 aExp;
3512     bits64 aSig, zSig0, zSig1;
3513 
3514     aSig = extractFloatx80Frac( a );
3515     aExp = extractFloatx80Exp( a );
3516     aSign = extractFloatx80Sign( a );
3517     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3518         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3519     }
3520     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3521     return packFloat128( aSign, aExp, zSig0, zSig1 );
3522 
3523 }
3524 
3525 #endif
3526 
3527 /*
3528 -------------------------------------------------------------------------------
3529 Rounds the extended double-precision floating-point value `a' to an integer,
3530 and returns the result as an extended quadruple-precision floating-point
3531 value.  The operation is performed according to the IEC/IEEE Standard for
3532 Binary Floating-Point Arithmetic.
3533 -------------------------------------------------------------------------------
3534 */
3535 floatx80 floatx80_round_to_int( floatx80 a )
3536 {
3537     flag aSign;
3538     int32 aExp;
3539     bits64 lastBitMask, roundBitsMask;
3540     int8 roundingMode;
3541     floatx80 z;
3542 
3543     aExp = extractFloatx80Exp( a );
3544     if ( 0x403E <= aExp ) {
3545         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3546             return propagateFloatx80NaN( a, a );
3547         }
3548         return a;
3549     }
3550     if ( aExp < 0x3FFF ) {
3551         if (    ( aExp == 0 )
3552              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3553             return a;
3554         }
3555         float_exception_flags |= float_flag_inexact;
3556         aSign = extractFloatx80Sign( a );
3557         switch ( float_rounding_mode ) {
3558          case float_round_nearest_even:
3559             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3560                ) {
3561                 return
3562                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3563             }
3564             break;
3565 	 case float_round_to_zero:
3566 	    break;
3567          case float_round_down:
3568             return
3569                   aSign ?
3570                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3571                 : packFloatx80( 0, 0, 0 );
3572          case float_round_up:
3573             return
3574                   aSign ? packFloatx80( 1, 0, 0 )
3575                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3576         }
3577         return packFloatx80( aSign, 0, 0 );
3578     }
3579     lastBitMask = 1;
3580     lastBitMask <<= 0x403E - aExp;
3581     roundBitsMask = lastBitMask - 1;
3582     z = a;
3583     roundingMode = float_rounding_mode;
3584     if ( roundingMode == float_round_nearest_even ) {
3585         z.low += lastBitMask>>1;
3586         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3587     }
3588     else if ( roundingMode != float_round_to_zero ) {
3589         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3590             z.low += roundBitsMask;
3591         }
3592     }
3593     z.low &= ~ roundBitsMask;
3594     if ( z.low == 0 ) {
3595         ++z.high;
3596         z.low = LIT64( 0x8000000000000000 );
3597     }
3598     if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3599     return z;
3600 
3601 }
3602 
3603 /*
3604 -------------------------------------------------------------------------------
3605 Returns the result of adding the absolute values of the extended double-
3606 precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3607 negated before being returned.  `zSign' is ignored if the result is a NaN.
3608 The addition is performed according to the IEC/IEEE Standard for Binary
3609 Floating-Point Arithmetic.
3610 -------------------------------------------------------------------------------
3611 */
3612 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3613 {
3614     int32 aExp, bExp, zExp;
3615     bits64 aSig, bSig, zSig0, zSig1;
3616     int32 expDiff;
3617 
3618     aSig = extractFloatx80Frac( a );
3619     aExp = extractFloatx80Exp( a );
3620     bSig = extractFloatx80Frac( b );
3621     bExp = extractFloatx80Exp( b );
3622     expDiff = aExp - bExp;
3623     if ( 0 < expDiff ) {
3624         if ( aExp == 0x7FFF ) {
3625             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3626             return a;
3627         }
3628         if ( bExp == 0 ) --expDiff;
3629         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3630         zExp = aExp;
3631     }
3632     else if ( expDiff < 0 ) {
3633         if ( bExp == 0x7FFF ) {
3634             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3635             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3636         }
3637         if ( aExp == 0 ) ++expDiff;
3638         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3639         zExp = bExp;
3640     }
3641     else {
3642         if ( aExp == 0x7FFF ) {
3643             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3644                 return propagateFloatx80NaN( a, b );
3645             }
3646             return a;
3647         }
3648         zSig1 = 0;
3649         zSig0 = aSig + bSig;
3650         if ( aExp == 0 ) {
3651             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3652             goto roundAndPack;
3653         }
3654         zExp = aExp;
3655         goto shiftRight1;
3656     }
3657     zSig0 = aSig + bSig;
3658     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3659  shiftRight1:
3660     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3661     zSig0 |= LIT64( 0x8000000000000000 );
3662     ++zExp;
3663  roundAndPack:
3664     return
3665         roundAndPackFloatx80(
3666             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3667 
3668 }
3669 
3670 /*
3671 -------------------------------------------------------------------------------
3672 Returns the result of subtracting the absolute values of the extended
3673 double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3674 difference is negated before being returned.  `zSign' is ignored if the
3675 result is a NaN.  The subtraction is performed according to the IEC/IEEE
3676 Standard for Binary Floating-Point Arithmetic.
3677 -------------------------------------------------------------------------------
3678 */
3679 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3680 {
3681     int32 aExp, bExp, zExp;
3682     bits64 aSig, bSig, zSig0, zSig1;
3683     int32 expDiff;
3684     floatx80 z;
3685 
3686     aSig = extractFloatx80Frac( a );
3687     aExp = extractFloatx80Exp( a );
3688     bSig = extractFloatx80Frac( b );
3689     bExp = extractFloatx80Exp( b );
3690     expDiff = aExp - bExp;
3691     if ( 0 < expDiff ) goto aExpBigger;
3692     if ( expDiff < 0 ) goto bExpBigger;
3693     if ( aExp == 0x7FFF ) {
3694         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3695             return propagateFloatx80NaN( a, b );
3696         }
3697         float_raise( float_flag_invalid );
3698         z.low = floatx80_default_nan_low;
3699         z.high = floatx80_default_nan_high;
3700         return z;
3701     }
3702     if ( aExp == 0 ) {
3703         aExp = 1;
3704         bExp = 1;
3705     }
3706     zSig1 = 0;
3707     if ( bSig < aSig ) goto aBigger;
3708     if ( aSig < bSig ) goto bBigger;
3709     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3710  bExpBigger:
3711     if ( bExp == 0x7FFF ) {
3712         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3713         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3714     }
3715     if ( aExp == 0 ) ++expDiff;
3716     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3717  bBigger:
3718     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3719     zExp = bExp;
3720     zSign ^= 1;
3721     goto normalizeRoundAndPack;
3722  aExpBigger:
3723     if ( aExp == 0x7FFF ) {
3724         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3725         return a;
3726     }
3727     if ( bExp == 0 ) --expDiff;
3728     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3729  aBigger:
3730     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3731     zExp = aExp;
3732  normalizeRoundAndPack:
3733     return
3734         normalizeRoundAndPackFloatx80(
3735             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3736 
3737 }
3738 
3739 /*
3740 -------------------------------------------------------------------------------
3741 Returns the result of adding the extended double-precision floating-point
3742 values `a' and `b'.  The operation is performed according to the IEC/IEEE
3743 Standard for Binary Floating-Point Arithmetic.
3744 -------------------------------------------------------------------------------
3745 */
3746 floatx80 floatx80_add( floatx80 a, floatx80 b )
3747 {
3748     flag aSign, bSign;
3749 
3750     aSign = extractFloatx80Sign( a );
3751     bSign = extractFloatx80Sign( b );
3752     if ( aSign == bSign ) {
3753         return addFloatx80Sigs( a, b, aSign );
3754     }
3755     else {
3756         return subFloatx80Sigs( a, b, aSign );
3757     }
3758 
3759 }
3760 
3761 /*
3762 -------------------------------------------------------------------------------
3763 Returns the result of subtracting the extended double-precision floating-
3764 point values `a' and `b'.  The operation is performed according to the
3765 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3766 -------------------------------------------------------------------------------
3767 */
3768 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3769 {
3770     flag aSign, bSign;
3771 
3772     aSign = extractFloatx80Sign( a );
3773     bSign = extractFloatx80Sign( b );
3774     if ( aSign == bSign ) {
3775         return subFloatx80Sigs( a, b, aSign );
3776     }
3777     else {
3778         return addFloatx80Sigs( a, b, aSign );
3779     }
3780 
3781 }
3782 
3783 /*
3784 -------------------------------------------------------------------------------
3785 Returns the result of multiplying the extended double-precision floating-
3786 point values `a' and `b'.  The operation is performed according to the
3787 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3788 -------------------------------------------------------------------------------
3789 */
3790 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3791 {
3792     flag aSign, bSign, zSign;
3793     int32 aExp, bExp, zExp;
3794     bits64 aSig, bSig, zSig0, zSig1;
3795     floatx80 z;
3796 
3797     aSig = extractFloatx80Frac( a );
3798     aExp = extractFloatx80Exp( a );
3799     aSign = extractFloatx80Sign( a );
3800     bSig = extractFloatx80Frac( b );
3801     bExp = extractFloatx80Exp( b );
3802     bSign = extractFloatx80Sign( b );
3803     zSign = aSign ^ bSign;
3804     if ( aExp == 0x7FFF ) {
3805         if (    (bits64) ( aSig<<1 )
3806              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3807             return propagateFloatx80NaN( a, b );
3808         }
3809         if ( ( bExp | bSig ) == 0 ) goto invalid;
3810         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3811     }
3812     if ( bExp == 0x7FFF ) {
3813         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3814         if ( ( aExp | aSig ) == 0 ) {
3815  invalid:
3816             float_raise( float_flag_invalid );
3817             z.low = floatx80_default_nan_low;
3818             z.high = floatx80_default_nan_high;
3819             return z;
3820         }
3821         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3822     }
3823     if ( aExp == 0 ) {
3824         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3825         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3826     }
3827     if ( bExp == 0 ) {
3828         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3829         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3830     }
3831     zExp = aExp + bExp - 0x3FFE;
3832     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3833     if ( 0 < (sbits64) zSig0 ) {
3834         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3835         --zExp;
3836     }
3837     return
3838         roundAndPackFloatx80(
3839             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3840 
3841 }
3842 
3843 /*
3844 -------------------------------------------------------------------------------
3845 Returns the result of dividing the extended double-precision floating-point
3846 value `a' by the corresponding value `b'.  The operation is performed
3847 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3848 -------------------------------------------------------------------------------
3849 */
3850 floatx80 floatx80_div( floatx80 a, floatx80 b )
3851 {
3852     flag aSign, bSign, zSign;
3853     int32 aExp, bExp, zExp;
3854     bits64 aSig, bSig, zSig0, zSig1;
3855     bits64 rem0, rem1, rem2, term0, term1, term2;
3856     floatx80 z;
3857 
3858     aSig = extractFloatx80Frac( a );
3859     aExp = extractFloatx80Exp( a );
3860     aSign = extractFloatx80Sign( a );
3861     bSig = extractFloatx80Frac( b );
3862     bExp = extractFloatx80Exp( b );
3863     bSign = extractFloatx80Sign( b );
3864     zSign = aSign ^ bSign;
3865     if ( aExp == 0x7FFF ) {
3866         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3867         if ( bExp == 0x7FFF ) {
3868             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3869             goto invalid;
3870         }
3871         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3872     }
3873     if ( bExp == 0x7FFF ) {
3874         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3875         return packFloatx80( zSign, 0, 0 );
3876     }
3877     if ( bExp == 0 ) {
3878         if ( bSig == 0 ) {
3879             if ( ( aExp | aSig ) == 0 ) {
3880  invalid:
3881                 float_raise( float_flag_invalid );
3882                 z.low = floatx80_default_nan_low;
3883                 z.high = floatx80_default_nan_high;
3884                 return z;
3885             }
3886             float_raise( float_flag_divbyzero );
3887             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3888         }
3889         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3890     }
3891     if ( aExp == 0 ) {
3892         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3893         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3894     }
3895     zExp = aExp - bExp + 0x3FFE;
3896     rem1 = 0;
3897     if ( bSig <= aSig ) {
3898         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3899         ++zExp;
3900     }
3901     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3902     mul64To128( bSig, zSig0, &term0, &term1 );
3903     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3904     while ( (sbits64) rem0 < 0 ) {
3905         --zSig0;
3906         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3907     }
3908     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3909     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3910         mul64To128( bSig, zSig1, &term1, &term2 );
3911         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3912         while ( (sbits64) rem1 < 0 ) {
3913             --zSig1;
3914             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3915         }
3916         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3917     }
3918     return
3919         roundAndPackFloatx80(
3920             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3921 
3922 }
3923 
3924 /*
3925 -------------------------------------------------------------------------------
3926 Returns the remainder of the extended double-precision floating-point value
3927 `a' with respect to the corresponding value `b'.  The operation is performed
3928 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3929 -------------------------------------------------------------------------------
3930 */
3931 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3932 {
3933     flag aSign, bSign, zSign;
3934     int32 aExp, bExp, expDiff;
3935     bits64 aSig0, aSig1, bSig;
3936     bits64 q, term0, term1, alternateASig0, alternateASig1;
3937     floatx80 z;
3938 
3939     aSig0 = extractFloatx80Frac( a );
3940     aExp = extractFloatx80Exp( a );
3941     aSign = extractFloatx80Sign( a );
3942     bSig = extractFloatx80Frac( b );
3943     bExp = extractFloatx80Exp( b );
3944     bSign = extractFloatx80Sign( b );
3945     if ( aExp == 0x7FFF ) {
3946         if (    (bits64) ( aSig0<<1 )
3947              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3948             return propagateFloatx80NaN( a, b );
3949         }
3950         goto invalid;
3951     }
3952     if ( bExp == 0x7FFF ) {
3953         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3954         return a;
3955     }
3956     if ( bExp == 0 ) {
3957         if ( bSig == 0 ) {
3958  invalid:
3959             float_raise( float_flag_invalid );
3960             z.low = floatx80_default_nan_low;
3961             z.high = floatx80_default_nan_high;
3962             return z;
3963         }
3964         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3965     }
3966     if ( aExp == 0 ) {
3967         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3968         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3969     }
3970     bSig |= LIT64( 0x8000000000000000 );
3971     zSign = aSign;
3972     expDiff = aExp - bExp;
3973     aSig1 = 0;
3974     if ( expDiff < 0 ) {
3975         if ( expDiff < -1 ) return a;
3976         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3977         expDiff = 0;
3978     }
3979     q = ( bSig <= aSig0 );
3980     if ( q ) aSig0 -= bSig;
3981     expDiff -= 64;
3982     while ( 0 < expDiff ) {
3983         q = estimateDiv128To64( aSig0, aSig1, bSig );
3984         q = ( 2 < q ) ? q - 2 : 0;
3985         mul64To128( bSig, q, &term0, &term1 );
3986         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3987         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3988         expDiff -= 62;
3989     }
3990     expDiff += 64;
3991     if ( 0 < expDiff ) {
3992         q = estimateDiv128To64( aSig0, aSig1, bSig );
3993         q = ( 2 < q ) ? q - 2 : 0;
3994         q >>= 64 - expDiff;
3995         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3996         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3997         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3998         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3999             ++q;
4000             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4001         }
4002     }
4003     else {
4004         term1 = 0;
4005         term0 = bSig;
4006     }
4007     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4008     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4009          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4010               && ( q & 1 ) )
4011        ) {
4012         aSig0 = alternateASig0;
4013         aSig1 = alternateASig1;
4014         zSign = ! zSign;
4015     }
4016     return
4017         normalizeRoundAndPackFloatx80(
4018             80, zSign, bExp + expDiff, aSig0, aSig1 );
4019 
4020 }
4021 
4022 /*
4023 -------------------------------------------------------------------------------
4024 Returns the square root of the extended double-precision floating-point
4025 value `a'.  The operation is performed according to the IEC/IEEE Standard
4026 for Binary Floating-Point Arithmetic.
4027 -------------------------------------------------------------------------------
4028 */
4029 floatx80 floatx80_sqrt( floatx80 a )
4030 {
4031     flag aSign;
4032     int32 aExp, zExp;
4033     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4034     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4035     floatx80 z;
4036 
4037     aSig0 = extractFloatx80Frac( a );
4038     aExp = extractFloatx80Exp( a );
4039     aSign = extractFloatx80Sign( a );
4040     if ( aExp == 0x7FFF ) {
4041         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4042         if ( ! aSign ) return a;
4043         goto invalid;
4044     }
4045     if ( aSign ) {
4046         if ( ( aExp | aSig0 ) == 0 ) return a;
4047  invalid:
4048         float_raise( float_flag_invalid );
4049         z.low = floatx80_default_nan_low;
4050         z.high = floatx80_default_nan_high;
4051         return z;
4052     }
4053     if ( aExp == 0 ) {
4054         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4055         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4056     }
4057     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4058     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4059     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4060     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4061     doubleZSig0 = zSig0<<1;
4062     mul64To128( zSig0, zSig0, &term0, &term1 );
4063     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4064     while ( (sbits64) rem0 < 0 ) {
4065         --zSig0;
4066         doubleZSig0 -= 2;
4067         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4068     }
4069     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4070     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4071         if ( zSig1 == 0 ) zSig1 = 1;
4072         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4073         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4074         mul64To128( zSig1, zSig1, &term2, &term3 );
4075         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4076         while ( (sbits64) rem1 < 0 ) {
4077             --zSig1;
4078             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4079             term3 |= 1;
4080             term2 |= doubleZSig0;
4081             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4082         }
4083         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4084     }
4085     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4086     zSig0 |= doubleZSig0;
4087     return
4088         roundAndPackFloatx80(
4089             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4090 
4091 }
4092 
4093 /*
4094 -------------------------------------------------------------------------------
4095 Returns 1 if the extended double-precision floating-point value `a' is
4096 equal to the corresponding value `b', and 0 otherwise.  The comparison is
4097 performed according to the IEC/IEEE Standard for Binary Floating-Point
4098 Arithmetic.
4099 -------------------------------------------------------------------------------
4100 */
4101 flag floatx80_eq( floatx80 a, floatx80 b )
4102 {
4103 
4104     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4105               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4106          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4107               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4108        ) {
4109         if (    floatx80_is_signaling_nan( a )
4110              || floatx80_is_signaling_nan( b ) ) {
4111             float_raise( float_flag_invalid );
4112         }
4113         return 0;
4114     }
4115     return
4116            ( a.low == b.low )
4117         && (    ( a.high == b.high )
4118              || (    ( a.low == 0 )
4119                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4120            );
4121 
4122 }
4123 
4124 /*
4125 -------------------------------------------------------------------------------
4126 Returns 1 if the extended double-precision floating-point value `a' is
4127 less than or equal to the corresponding value `b', and 0 otherwise.  The
4128 comparison is performed according to the IEC/IEEE Standard for Binary
4129 Floating-Point Arithmetic.
4130 -------------------------------------------------------------------------------
4131 */
4132 flag floatx80_le( floatx80 a, floatx80 b )
4133 {
4134     flag aSign, bSign;
4135 
4136     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4137               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4138          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4139               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4140        ) {
4141         float_raise( float_flag_invalid );
4142         return 0;
4143     }
4144     aSign = extractFloatx80Sign( a );
4145     bSign = extractFloatx80Sign( b );
4146     if ( aSign != bSign ) {
4147         return
4148                aSign
4149             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4150                  == 0 );
4151     }
4152     return
4153           aSign ? le128( b.high, b.low, a.high, a.low )
4154         : le128( a.high, a.low, b.high, b.low );
4155 
4156 }
4157 
4158 /*
4159 -------------------------------------------------------------------------------
4160 Returns 1 if the extended double-precision floating-point value `a' is
4161 less than the corresponding value `b', and 0 otherwise.  The comparison
4162 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4163 Arithmetic.
4164 -------------------------------------------------------------------------------
4165 */
4166 flag floatx80_lt( floatx80 a, floatx80 b )
4167 {
4168     flag aSign, bSign;
4169 
4170     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4171               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4172          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4173               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4174        ) {
4175         float_raise( float_flag_invalid );
4176         return 0;
4177     }
4178     aSign = extractFloatx80Sign( a );
4179     bSign = extractFloatx80Sign( b );
4180     if ( aSign != bSign ) {
4181         return
4182                aSign
4183             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4184                  != 0 );
4185     }
4186     return
4187           aSign ? lt128( b.high, b.low, a.high, a.low )
4188         : lt128( a.high, a.low, b.high, b.low );
4189 
4190 }
4191 
4192 /*
4193 -------------------------------------------------------------------------------
4194 Returns 1 if the extended double-precision floating-point value `a' is equal
4195 to the corresponding value `b', and 0 otherwise.  The invalid exception is
4196 raised if either operand is a NaN.  Otherwise, the comparison is performed
4197 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4198 -------------------------------------------------------------------------------
4199 */
4200 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4201 {
4202 
4203     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4204               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4205          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4206               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4207        ) {
4208         float_raise( float_flag_invalid );
4209         return 0;
4210     }
4211     return
4212            ( a.low == b.low )
4213         && (    ( a.high == b.high )
4214              || (    ( a.low == 0 )
4215                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4216            );
4217 
4218 }
4219 
4220 /*
4221 -------------------------------------------------------------------------------
4222 Returns 1 if the extended double-precision floating-point value `a' is less
4223 than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4224 do not cause an exception.  Otherwise, the comparison is performed according
4225 to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4226 -------------------------------------------------------------------------------
4227 */
4228 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4229 {
4230     flag aSign, bSign;
4231 
4232     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4233               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4234          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4235               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4236        ) {
4237         if (    floatx80_is_signaling_nan( a )
4238              || floatx80_is_signaling_nan( b ) ) {
4239             float_raise( float_flag_invalid );
4240         }
4241         return 0;
4242     }
4243     aSign = extractFloatx80Sign( a );
4244     bSign = extractFloatx80Sign( b );
4245     if ( aSign != bSign ) {
4246         return
4247                aSign
4248             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4249                  == 0 );
4250     }
4251     return
4252           aSign ? le128( b.high, b.low, a.high, a.low )
4253         : le128( a.high, a.low, b.high, b.low );
4254 
4255 }
4256 
4257 /*
4258 -------------------------------------------------------------------------------
4259 Returns 1 if the extended double-precision floating-point value `a' is less
4260 than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4261 an exception.  Otherwise, the comparison is performed according to the
4262 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4263 -------------------------------------------------------------------------------
4264 */
4265 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4266 {
4267     flag aSign, bSign;
4268 
4269     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4270               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4271          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4272               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4273        ) {
4274         if (    floatx80_is_signaling_nan( a )
4275              || floatx80_is_signaling_nan( b ) ) {
4276             float_raise( float_flag_invalid );
4277         }
4278         return 0;
4279     }
4280     aSign = extractFloatx80Sign( a );
4281     bSign = extractFloatx80Sign( b );
4282     if ( aSign != bSign ) {
4283         return
4284                aSign
4285             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4286                  != 0 );
4287     }
4288     return
4289           aSign ? lt128( b.high, b.low, a.high, a.low )
4290         : lt128( a.high, a.low, b.high, b.low );
4291 
4292 }
4293 
4294 #endif
4295 
4296 #ifdef FLOAT128
4297 
4298 /*
4299 -------------------------------------------------------------------------------
4300 Returns the result of converting the quadruple-precision floating-point
4301 value `a' to the 32-bit two's complement integer format.  The conversion
4302 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4303 Arithmetic---which means in particular that the conversion is rounded
4304 according to the current rounding mode.  If `a' is a NaN, the largest
4305 positive integer is returned.  Otherwise, if the conversion overflows, the
4306 largest integer with the same sign as `a' is returned.
4307 -------------------------------------------------------------------------------
4308 */
4309 int32 float128_to_int32( float128 a )
4310 {
4311     flag aSign;
4312     int32 aExp, shiftCount;
4313     bits64 aSig0, aSig1;
4314 
4315     aSig1 = extractFloat128Frac1( a );
4316     aSig0 = extractFloat128Frac0( a );
4317     aExp = extractFloat128Exp( a );
4318     aSign = extractFloat128Sign( a );
4319     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4320     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4321     aSig0 |= ( aSig1 != 0 );
4322     shiftCount = 0x4028 - aExp;
4323     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4324     return roundAndPackInt32( aSign, aSig0 );
4325 
4326 }
4327 
4328 /*
4329 -------------------------------------------------------------------------------
4330 Returns the result of converting the quadruple-precision floating-point
4331 value `a' to the 32-bit two's complement integer format.  The conversion
4332 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4333 Arithmetic, except that the conversion is always rounded toward zero.  If
4334 `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4335 conversion overflows, the largest integer with the same sign as `a' is
4336 returned.
4337 -------------------------------------------------------------------------------
4338 */
4339 int32 float128_to_int32_round_to_zero( float128 a )
4340 {
4341     flag aSign;
4342     int32 aExp, shiftCount;
4343     bits64 aSig0, aSig1, savedASig;
4344     int32 z;
4345 
4346     aSig1 = extractFloat128Frac1( a );
4347     aSig0 = extractFloat128Frac0( a );
4348     aExp = extractFloat128Exp( a );
4349     aSign = extractFloat128Sign( a );
4350     aSig0 |= ( aSig1 != 0 );
4351     if ( 0x401E < aExp ) {
4352         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4353         goto invalid;
4354     }
4355     else if ( aExp < 0x3FFF ) {
4356         if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4357         return 0;
4358     }
4359     aSig0 |= LIT64( 0x0001000000000000 );
4360     shiftCount = 0x402F - aExp;
4361     savedASig = aSig0;
4362     aSig0 >>= shiftCount;
4363     z = aSig0;
4364     if ( aSign ) z = - z;
4365     if ( ( z < 0 ) ^ aSign ) {
4366  invalid:
4367         float_raise( float_flag_invalid );
4368         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4369     }
4370     if ( ( aSig0<<shiftCount ) != savedASig ) {
4371         float_exception_flags |= float_flag_inexact;
4372     }
4373     return z;
4374 
4375 }
4376 
4377 /*
4378 -------------------------------------------------------------------------------
4379 Returns the result of converting the quadruple-precision floating-point
4380 value `a' to the 64-bit two's complement integer format.  The conversion
4381 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4382 Arithmetic---which means in particular that the conversion is rounded
4383 according to the current rounding mode.  If `a' is a NaN, the largest
4384 positive integer is returned.  Otherwise, if the conversion overflows, the
4385 largest integer with the same sign as `a' is returned.
4386 -------------------------------------------------------------------------------
4387 */
4388 int64 float128_to_int64( float128 a )
4389 {
4390     flag aSign;
4391     int32 aExp, shiftCount;
4392     bits64 aSig0, aSig1;
4393 
4394     aSig1 = extractFloat128Frac1( a );
4395     aSig0 = extractFloat128Frac0( a );
4396     aExp = extractFloat128Exp( a );
4397     aSign = extractFloat128Sign( a );
4398     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4399     shiftCount = 0x402F - aExp;
4400     if ( shiftCount <= 0 ) {
4401         if ( 0x403E < aExp ) {
4402             float_raise( float_flag_invalid );
4403             if (    ! aSign
4404                  || (    ( aExp == 0x7FFF )
4405                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4406                     )
4407                ) {
4408                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4409             }
4410             return (sbits64) LIT64( 0x8000000000000000 );
4411         }
4412         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4413     }
4414     else {
4415         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4416     }
4417     return roundAndPackInt64( aSign, aSig0, aSig1 );
4418 
4419 }
4420 
4421 /*
4422 -------------------------------------------------------------------------------
4423 Returns the result of converting the quadruple-precision floating-point
4424 value `a' to the 64-bit two's complement integer format.  The conversion
4425 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4426 Arithmetic, except that the conversion is always rounded toward zero.
4427 If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4428 the conversion overflows, the largest integer with the same sign as `a' is
4429 returned.
4430 -------------------------------------------------------------------------------
4431 */
4432 int64 float128_to_int64_round_to_zero( float128 a )
4433 {
4434     flag aSign;
4435     int32 aExp, shiftCount;
4436     bits64 aSig0, aSig1;
4437     int64 z;
4438 
4439     aSig1 = extractFloat128Frac1( a );
4440     aSig0 = extractFloat128Frac0( a );
4441     aExp = extractFloat128Exp( a );
4442     aSign = extractFloat128Sign( a );
4443     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4444     shiftCount = aExp - 0x402F;
4445     if ( 0 < shiftCount ) {
4446         if ( 0x403E <= aExp ) {
4447             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4448             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4449                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4450                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4451             }
4452             else {
4453                 float_raise( float_flag_invalid );
4454                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4455                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4456                 }
4457             }
4458             return (sbits64) LIT64( 0x8000000000000000 );
4459         }
4460         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4461         if ( (bits64) ( aSig1<<shiftCount ) ) {
4462             float_exception_flags |= float_flag_inexact;
4463         }
4464     }
4465     else {
4466         if ( aExp < 0x3FFF ) {
4467             if ( aExp | aSig0 | aSig1 ) {
4468                 float_exception_flags |= float_flag_inexact;
4469             }
4470             return 0;
4471         }
4472         z = aSig0>>( - shiftCount );
4473         if (    aSig1
4474              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4475             float_exception_flags |= float_flag_inexact;
4476         }
4477     }
4478     if ( aSign ) z = - z;
4479     return z;
4480 
4481 }
4482 
4483 #if (defined(SOFTFLOATSPARC64_FOR_GCC) || defined(SOFTFLOAT_FOR_GCC)) \
4484     && defined(SOFTFLOAT_NEED_FIXUNS)
4485 /*
4486  * just like above - but do not care for overflow of signed results
4487  */
4488 uint64 float128_to_uint64_round_to_zero( float128 a )
4489 {
4490     flag aSign;
4491     int32 aExp, shiftCount;
4492     bits64 aSig0, aSig1;
4493     uint64 z;
4494 
4495     aSig1 = extractFloat128Frac1( a );
4496     aSig0 = extractFloat128Frac0( a );
4497     aExp = extractFloat128Exp( a );
4498     aSign = extractFloat128Sign( a );
4499     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4500     shiftCount = aExp - 0x402F;
4501     if ( 0 < shiftCount ) {
4502         if ( 0x403F <= aExp ) {
4503             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4504             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4505                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4506                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4507             }
4508             else {
4509                 float_raise( float_flag_invalid );
4510             }
4511             return LIT64( 0xFFFFFFFFFFFFFFFF );
4512         }
4513         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4514         if ( (bits64) ( aSig1<<shiftCount ) ) {
4515             float_exception_flags |= float_flag_inexact;
4516         }
4517     }
4518     else {
4519         if ( aExp < 0x3FFF ) {
4520             if ( aExp | aSig0 | aSig1 ) {
4521                 float_exception_flags |= float_flag_inexact;
4522             }
4523             return 0;
4524         }
4525         z = aSig0>>( - shiftCount );
4526         if (aSig1 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4527             float_exception_flags |= float_flag_inexact;
4528         }
4529     }
4530     if ( aSign ) z = - z;
4531     return z;
4532 
4533 }
4534 #endif /* (SOFTFLOATSPARC64_FOR_GCC || SOFTFLOAT_FOR_GCC) && SOFTFLOAT_NEED_FIXUNS */
4535 
4536 /*
4537 -------------------------------------------------------------------------------
4538 Returns the result of converting the quadruple-precision floating-point
4539 value `a' to the single-precision floating-point format.  The conversion
4540 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4541 Arithmetic.
4542 -------------------------------------------------------------------------------
4543 */
4544 float32 float128_to_float32( float128 a )
4545 {
4546     flag aSign;
4547     int32 aExp;
4548     bits64 aSig0, aSig1;
4549     bits32 zSig;
4550 
4551     aSig1 = extractFloat128Frac1( a );
4552     aSig0 = extractFloat128Frac0( a );
4553     aExp = extractFloat128Exp( a );
4554     aSign = extractFloat128Sign( a );
4555     if ( aExp == 0x7FFF ) {
4556         if ( aSig0 | aSig1 ) {
4557             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4558         }
4559         return packFloat32( aSign, 0xFF, 0 );
4560     }
4561     aSig0 |= ( aSig1 != 0 );
4562     shift64RightJamming( aSig0, 18, &aSig0 );
4563     zSig = aSig0;
4564     if ( aExp || zSig ) {
4565         zSig |= 0x40000000;
4566         aExp -= 0x3F81;
4567     }
4568     return roundAndPackFloat32( aSign, aExp, zSig );
4569 
4570 }
4571 
4572 /*
4573 -------------------------------------------------------------------------------
4574 Returns the result of converting the quadruple-precision floating-point
4575 value `a' to the double-precision floating-point format.  The conversion
4576 is performed according to the IEC/IEEE Standard for Binary Floating-Point
4577 Arithmetic.
4578 -------------------------------------------------------------------------------
4579 */
4580 float64 float128_to_float64( float128 a )
4581 {
4582     flag aSign;
4583     int32 aExp;
4584     bits64 aSig0, aSig1;
4585 
4586     aSig1 = extractFloat128Frac1( a );
4587     aSig0 = extractFloat128Frac0( a );
4588     aExp = extractFloat128Exp( a );
4589     aSign = extractFloat128Sign( a );
4590     if ( aExp == 0x7FFF ) {
4591         if ( aSig0 | aSig1 ) {
4592             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4593         }
4594         return packFloat64( aSign, 0x7FF, 0 );
4595     }
4596     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4597     aSig0 |= ( aSig1 != 0 );
4598     if ( aExp || aSig0 ) {
4599         aSig0 |= LIT64( 0x4000000000000000 );
4600         aExp -= 0x3C01;
4601     }
4602     return roundAndPackFloat64( aSign, aExp, aSig0 );
4603 
4604 }
4605 
4606 #ifdef FLOATX80
4607 
4608 /*
4609 -------------------------------------------------------------------------------
4610 Returns the result of converting the quadruple-precision floating-point
4611 value `a' to the extended double-precision floating-point format.  The
4612 conversion is performed according to the IEC/IEEE Standard for Binary
4613 Floating-Point Arithmetic.
4614 -------------------------------------------------------------------------------
4615 */
4616 floatx80 float128_to_floatx80( float128 a )
4617 {
4618     flag aSign;
4619     int32 aExp;
4620     bits64 aSig0, aSig1;
4621 
4622     aSig1 = extractFloat128Frac1( a );
4623     aSig0 = extractFloat128Frac0( a );
4624     aExp = extractFloat128Exp( a );
4625     aSign = extractFloat128Sign( a );
4626     if ( aExp == 0x7FFF ) {
4627         if ( aSig0 | aSig1 ) {
4628             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4629         }
4630         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4631     }
4632     if ( aExp == 0 ) {
4633         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4634         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4635     }
4636     else {
4637         aSig0 |= LIT64( 0x0001000000000000 );
4638     }
4639     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4640     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4641 
4642 }
4643 
4644 #endif
4645 
4646 /*
4647 -------------------------------------------------------------------------------
4648 Rounds the quadruple-precision floating-point value `a' to an integer, and
4649 returns the result as a quadruple-precision floating-point value.  The
4650 operation is performed according to the IEC/IEEE Standard for Binary
4651 Floating-Point Arithmetic.
4652 -------------------------------------------------------------------------------
4653 */
4654 float128 float128_round_to_int( float128 a )
4655 {
4656     flag aSign;
4657     int32 aExp;
4658     bits64 lastBitMask, roundBitsMask;
4659     int8 roundingMode;
4660     float128 z;
4661 
4662     aExp = extractFloat128Exp( a );
4663     if ( 0x402F <= aExp ) {
4664         if ( 0x406F <= aExp ) {
4665             if (    ( aExp == 0x7FFF )
4666                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4667                ) {
4668                 return propagateFloat128NaN( a, a );
4669             }
4670             return a;
4671         }
4672         lastBitMask = 1;
4673         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4674         roundBitsMask = lastBitMask - 1;
4675         z = a;
4676         roundingMode = float_rounding_mode;
4677         if ( roundingMode == float_round_nearest_even ) {
4678             if ( lastBitMask ) {
4679                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4680                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4681             }
4682             else {
4683                 if ( (sbits64) z.low < 0 ) {
4684                     ++z.high;
4685                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4686                 }
4687             }
4688         }
4689         else if ( roundingMode != float_round_to_zero ) {
4690             if (   extractFloat128Sign( z )
4691                  ^ ( roundingMode == float_round_up ) ) {
4692                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4693             }
4694         }
4695         z.low &= ~ roundBitsMask;
4696     }
4697     else {
4698         if ( aExp < 0x3FFF ) {
4699             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4700             float_exception_flags |= float_flag_inexact;
4701             aSign = extractFloat128Sign( a );
4702             switch ( float_rounding_mode ) {
4703              case float_round_nearest_even:
4704                 if (    ( aExp == 0x3FFE )
4705                      && (   extractFloat128Frac0( a )
4706                           | extractFloat128Frac1( a ) )
4707                    ) {
4708                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4709                 }
4710                 break;
4711 	     case float_round_to_zero:
4712 		break;
4713              case float_round_down:
4714                 return
4715                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4716                     : packFloat128( 0, 0, 0, 0 );
4717              case float_round_up:
4718                 return
4719                       aSign ? packFloat128( 1, 0, 0, 0 )
4720                     : packFloat128( 0, 0x3FFF, 0, 0 );
4721             }
4722             return packFloat128( aSign, 0, 0, 0 );
4723         }
4724         lastBitMask = 1;
4725         lastBitMask <<= 0x402F - aExp;
4726         roundBitsMask = lastBitMask - 1;
4727         z.low = 0;
4728         z.high = a.high;
4729         roundingMode = float_rounding_mode;
4730         if ( roundingMode == float_round_nearest_even ) {
4731             z.high += lastBitMask>>1;
4732             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4733                 z.high &= ~ lastBitMask;
4734             }
4735         }
4736         else if ( roundingMode != float_round_to_zero ) {
4737             if (   extractFloat128Sign( z )
4738                  ^ ( roundingMode == float_round_up ) ) {
4739                 z.high |= ( a.low != 0 );
4740                 z.high += roundBitsMask;
4741             }
4742         }
4743         z.high &= ~ roundBitsMask;
4744     }
4745     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4746         float_exception_flags |= float_flag_inexact;
4747     }
4748     return z;
4749 
4750 }
4751 
4752 /*
4753 -------------------------------------------------------------------------------
4754 Returns the result of adding the absolute values of the quadruple-precision
4755 floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4756 before being returned.  `zSign' is ignored if the result is a NaN.
4757 The addition is performed according to the IEC/IEEE Standard for Binary
4758 Floating-Point Arithmetic.
4759 -------------------------------------------------------------------------------
4760 */
4761 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4762 {
4763     int32 aExp, bExp, zExp;
4764     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4765     int32 expDiff;
4766 
4767     aSig1 = extractFloat128Frac1( a );
4768     aSig0 = extractFloat128Frac0( a );
4769     aExp = extractFloat128Exp( a );
4770     bSig1 = extractFloat128Frac1( b );
4771     bSig0 = extractFloat128Frac0( b );
4772     bExp = extractFloat128Exp( b );
4773     expDiff = aExp - bExp;
4774     if ( 0 < expDiff ) {
4775         if ( aExp == 0x7FFF ) {
4776             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4777             return a;
4778         }
4779         if ( bExp == 0 ) {
4780             --expDiff;
4781         }
4782         else {
4783             bSig0 |= LIT64( 0x0001000000000000 );
4784         }
4785         shift128ExtraRightJamming(
4786             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4787         zExp = aExp;
4788     }
4789     else if ( expDiff < 0 ) {
4790         if ( bExp == 0x7FFF ) {
4791             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4792             return packFloat128( zSign, 0x7FFF, 0, 0 );
4793         }
4794         if ( aExp == 0 ) {
4795             ++expDiff;
4796         }
4797         else {
4798             aSig0 |= LIT64( 0x0001000000000000 );
4799         }
4800         shift128ExtraRightJamming(
4801             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4802         zExp = bExp;
4803     }
4804     else {
4805         if ( aExp == 0x7FFF ) {
4806             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4807                 return propagateFloat128NaN( a, b );
4808             }
4809             return a;
4810         }
4811         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4812         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4813         zSig2 = 0;
4814         zSig0 |= LIT64( 0x0002000000000000 );
4815         zExp = aExp;
4816         goto shiftRight1;
4817     }
4818     aSig0 |= LIT64( 0x0001000000000000 );
4819     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4820     --zExp;
4821     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4822     ++zExp;
4823  shiftRight1:
4824     shift128ExtraRightJamming(
4825         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4826  roundAndPack:
4827     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4828 
4829 }
4830 
4831 /*
4832 -------------------------------------------------------------------------------
4833 Returns the result of subtracting the absolute values of the quadruple-
4834 precision floating-point values `a' and `b'.  If `zSign' is 1, the
4835 difference is negated before being returned.  `zSign' is ignored if the
4836 result is a NaN.  The subtraction is performed according to the IEC/IEEE
4837 Standard for Binary Floating-Point Arithmetic.
4838 -------------------------------------------------------------------------------
4839 */
4840 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4841 {
4842     int32 aExp, bExp, zExp;
4843     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4844     int32 expDiff;
4845     float128 z;
4846 
4847     aSig1 = extractFloat128Frac1( a );
4848     aSig0 = extractFloat128Frac0( a );
4849     aExp = extractFloat128Exp( a );
4850     bSig1 = extractFloat128Frac1( b );
4851     bSig0 = extractFloat128Frac0( b );
4852     bExp = extractFloat128Exp( b );
4853     expDiff = aExp - bExp;
4854     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4855     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4856     if ( 0 < expDiff ) goto aExpBigger;
4857     if ( expDiff < 0 ) goto bExpBigger;
4858     if ( aExp == 0x7FFF ) {
4859         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4860             return propagateFloat128NaN( a, b );
4861         }
4862         float_raise( float_flag_invalid );
4863         z.low = float128_default_nan_low;
4864         z.high = float128_default_nan_high;
4865         return z;
4866     }
4867     if ( aExp == 0 ) {
4868         aExp = 1;
4869         bExp = 1;
4870     }
4871     if ( bSig0 < aSig0 ) goto aBigger;
4872     if ( aSig0 < bSig0 ) goto bBigger;
4873     if ( bSig1 < aSig1 ) goto aBigger;
4874     if ( aSig1 < bSig1 ) goto bBigger;
4875     return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4876  bExpBigger:
4877     if ( bExp == 0x7FFF ) {
4878         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4879         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4880     }
4881     if ( aExp == 0 ) {
4882         ++expDiff;
4883     }
4884     else {
4885         aSig0 |= LIT64( 0x4000000000000000 );
4886     }
4887     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4888     bSig0 |= LIT64( 0x4000000000000000 );
4889  bBigger:
4890     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4891     zExp = bExp;
4892     zSign ^= 1;
4893     goto normalizeRoundAndPack;
4894  aExpBigger:
4895     if ( aExp == 0x7FFF ) {
4896         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4897         return a;
4898     }
4899     if ( bExp == 0 ) {
4900         --expDiff;
4901     }
4902     else {
4903         bSig0 |= LIT64( 0x4000000000000000 );
4904     }
4905     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4906     aSig0 |= LIT64( 0x4000000000000000 );
4907  aBigger:
4908     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4909     zExp = aExp;
4910  normalizeRoundAndPack:
4911     --zExp;
4912     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4913 
4914 }
4915 
4916 /*
4917 -------------------------------------------------------------------------------
4918 Returns the result of adding the quadruple-precision floating-point values
4919 `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4920 for Binary Floating-Point Arithmetic.
4921 -------------------------------------------------------------------------------
4922 */
4923 float128 float128_add( float128 a, float128 b )
4924 {
4925     flag aSign, bSign;
4926 
4927     aSign = extractFloat128Sign( a );
4928     bSign = extractFloat128Sign( b );
4929     if ( aSign == bSign ) {
4930         return addFloat128Sigs( a, b, aSign );
4931     }
4932     else {
4933         return subFloat128Sigs( a, b, aSign );
4934     }
4935 
4936 }
4937 
4938 /*
4939 -------------------------------------------------------------------------------
4940 Returns the result of subtracting the quadruple-precision floating-point
4941 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4942 Standard for Binary Floating-Point Arithmetic.
4943 -------------------------------------------------------------------------------
4944 */
4945 float128 float128_sub( float128 a, float128 b )
4946 {
4947     flag aSign, bSign;
4948 
4949     aSign = extractFloat128Sign( a );
4950     bSign = extractFloat128Sign( b );
4951     if ( aSign == bSign ) {
4952         return subFloat128Sigs( a, b, aSign );
4953     }
4954     else {
4955         return addFloat128Sigs( a, b, aSign );
4956     }
4957 
4958 }
4959 
4960 /*
4961 -------------------------------------------------------------------------------
4962 Returns the result of multiplying the quadruple-precision floating-point
4963 values `a' and `b'.  The operation is performed according to the IEC/IEEE
4964 Standard for Binary Floating-Point Arithmetic.
4965 -------------------------------------------------------------------------------
4966 */
4967 float128 float128_mul( float128 a, float128 b )
4968 {
4969     flag aSign, bSign, zSign;
4970     int32 aExp, bExp, zExp;
4971     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4972     float128 z;
4973 
4974     aSig1 = extractFloat128Frac1( a );
4975     aSig0 = extractFloat128Frac0( a );
4976     aExp = extractFloat128Exp( a );
4977     aSign = extractFloat128Sign( a );
4978     bSig1 = extractFloat128Frac1( b );
4979     bSig0 = extractFloat128Frac0( b );
4980     bExp = extractFloat128Exp( b );
4981     bSign = extractFloat128Sign( b );
4982     zSign = aSign ^ bSign;
4983     if ( aExp == 0x7FFF ) {
4984         if (    ( aSig0 | aSig1 )
4985              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4986             return propagateFloat128NaN( a, b );
4987         }
4988         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4989         return packFloat128( zSign, 0x7FFF, 0, 0 );
4990     }
4991     if ( bExp == 0x7FFF ) {
4992         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4993         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4994  invalid:
4995             float_raise( float_flag_invalid );
4996             z.low = float128_default_nan_low;
4997             z.high = float128_default_nan_high;
4998             return z;
4999         }
5000         return packFloat128( zSign, 0x7FFF, 0, 0 );
5001     }
5002     if ( aExp == 0 ) {
5003         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5004         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5005     }
5006     if ( bExp == 0 ) {
5007         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5008         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5009     }
5010     zExp = aExp + bExp - 0x4000;
5011     aSig0 |= LIT64( 0x0001000000000000 );
5012     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5013     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5014     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5015     zSig2 |= ( zSig3 != 0 );
5016     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5017         shift128ExtraRightJamming(
5018             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5019         ++zExp;
5020     }
5021     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5022 
5023 }
5024 
5025 /*
5026 -------------------------------------------------------------------------------
5027 Returns the result of dividing the quadruple-precision floating-point value
5028 `a' by the corresponding value `b'.  The operation is performed according to
5029 the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5030 -------------------------------------------------------------------------------
5031 */
5032 float128 float128_div( float128 a, float128 b )
5033 {
5034     flag aSign, bSign, zSign;
5035     int32 aExp, bExp, zExp;
5036     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5037     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5038     float128 z;
5039 
5040     aSig1 = extractFloat128Frac1( a );
5041     aSig0 = extractFloat128Frac0( a );
5042     aExp = extractFloat128Exp( a );
5043     aSign = extractFloat128Sign( a );
5044     bSig1 = extractFloat128Frac1( b );
5045     bSig0 = extractFloat128Frac0( b );
5046     bExp = extractFloat128Exp( b );
5047     bSign = extractFloat128Sign( b );
5048     zSign = aSign ^ bSign;
5049     if ( aExp == 0x7FFF ) {
5050         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
5051         if ( bExp == 0x7FFF ) {
5052             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5053             goto invalid;
5054         }
5055         return packFloat128( zSign, 0x7FFF, 0, 0 );
5056     }
5057     if ( bExp == 0x7FFF ) {
5058         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5059         return packFloat128( zSign, 0, 0, 0 );
5060     }
5061     if ( bExp == 0 ) {
5062         if ( ( bSig0 | bSig1 ) == 0 ) {
5063             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5064  invalid:
5065                 float_raise( float_flag_invalid );
5066                 z.low = float128_default_nan_low;
5067                 z.high = float128_default_nan_high;
5068                 return z;
5069             }
5070             float_raise( float_flag_divbyzero );
5071             return packFloat128( zSign, 0x7FFF, 0, 0 );
5072         }
5073         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5074     }
5075     if ( aExp == 0 ) {
5076         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5077         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5078     }
5079     zExp = aExp - bExp + 0x3FFD;
5080     shortShift128Left(
5081         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5082     shortShift128Left(
5083         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5084     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5085         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5086         ++zExp;
5087     }
5088     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5089     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5090     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5091     while ( (sbits64) rem0 < 0 ) {
5092         --zSig0;
5093         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5094     }
5095     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5096     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5097         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5098         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5099         while ( (sbits64) rem1 < 0 ) {
5100             --zSig1;
5101             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5102         }
5103         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5104     }
5105     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5106     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5107 
5108 }
5109 
5110 /*
5111 -------------------------------------------------------------------------------
5112 Returns the remainder of the quadruple-precision floating-point value `a'
5113 with respect to the corresponding value `b'.  The operation is performed
5114 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5115 -------------------------------------------------------------------------------
5116 */
5117 float128 float128_rem( float128 a, float128 b )
5118 {
5119     flag aSign, bSign, zSign;
5120     int32 aExp, bExp, expDiff;
5121     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5122     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5123     sbits64 sigMean0;
5124     float128 z;
5125 
5126     aSig1 = extractFloat128Frac1( a );
5127     aSig0 = extractFloat128Frac0( a );
5128     aExp = extractFloat128Exp( a );
5129     aSign = extractFloat128Sign( a );
5130     bSig1 = extractFloat128Frac1( b );
5131     bSig0 = extractFloat128Frac0( b );
5132     bExp = extractFloat128Exp( b );
5133     bSign = extractFloat128Sign( b );
5134     if ( aExp == 0x7FFF ) {
5135         if (    ( aSig0 | aSig1 )
5136              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5137             return propagateFloat128NaN( a, b );
5138         }
5139         goto invalid;
5140     }
5141     if ( bExp == 0x7FFF ) {
5142         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5143         return a;
5144     }
5145     if ( bExp == 0 ) {
5146         if ( ( bSig0 | bSig1 ) == 0 ) {
5147  invalid:
5148             float_raise( float_flag_invalid );
5149             z.low = float128_default_nan_low;
5150             z.high = float128_default_nan_high;
5151             return z;
5152         }
5153         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5154     }
5155     if ( aExp == 0 ) {
5156         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5157         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5158     }
5159     expDiff = aExp - bExp;
5160     if ( expDiff < -1 ) return a;
5161     shortShift128Left(
5162         aSig0 | LIT64( 0x0001000000000000 ),
5163         aSig1,
5164         15 - ( expDiff < 0 ),
5165         &aSig0,
5166         &aSig1
5167     );
5168     shortShift128Left(
5169         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5170     q = le128( bSig0, bSig1, aSig0, aSig1 );
5171     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5172     expDiff -= 64;
5173     while ( 0 < expDiff ) {
5174         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5175         q = ( 4 < q ) ? q - 4 : 0;
5176         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5177         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5178         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5179         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5180         expDiff -= 61;
5181     }
5182     if ( -64 < expDiff ) {
5183         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5184         q = ( 4 < q ) ? q - 4 : 0;
5185         q >>= - expDiff;
5186         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5187         expDiff += 52;
5188         if ( expDiff < 0 ) {
5189             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5190         }
5191         else {
5192             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5193         }
5194         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5195         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5196     }
5197     else {
5198         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5199         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5200     }
5201     do {
5202         alternateASig0 = aSig0;
5203         alternateASig1 = aSig1;
5204         ++q;
5205         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5206     } while ( 0 <= (sbits64) aSig0 );
5207     add128(
5208         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5209     if (    ( sigMean0 < 0 )
5210          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5211         aSig0 = alternateASig0;
5212         aSig1 = alternateASig1;
5213     }
5214     zSign = ( (sbits64) aSig0 < 0 );
5215     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5216     return
5217         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5218 
5219 }
5220 
5221 /*
5222 -------------------------------------------------------------------------------
5223 Returns the square root of the quadruple-precision floating-point value `a'.
5224 The operation is performed according to the IEC/IEEE Standard for Binary
5225 Floating-Point Arithmetic.
5226 -------------------------------------------------------------------------------
5227 */
5228 float128 float128_sqrt( float128 a )
5229 {
5230     flag aSign;
5231     int32 aExp, zExp;
5232     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5233     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5234     float128 z;
5235 
5236     aSig1 = extractFloat128Frac1( a );
5237     aSig0 = extractFloat128Frac0( a );
5238     aExp = extractFloat128Exp( a );
5239     aSign = extractFloat128Sign( a );
5240     if ( aExp == 0x7FFF ) {
5241         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5242         if ( ! aSign ) return a;
5243         goto invalid;
5244     }
5245     if ( aSign ) {
5246         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5247  invalid:
5248         float_raise( float_flag_invalid );
5249         z.low = float128_default_nan_low;
5250         z.high = float128_default_nan_high;
5251         return z;
5252     }
5253     if ( aExp == 0 ) {
5254         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5255         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5256     }
5257     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5258     aSig0 |= LIT64( 0x0001000000000000 );
5259     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5260     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5261     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5262     doubleZSig0 = zSig0<<1;
5263     mul64To128( zSig0, zSig0, &term0, &term1 );
5264     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5265     while ( (sbits64) rem0 < 0 ) {
5266         --zSig0;
5267         doubleZSig0 -= 2;
5268         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5269     }
5270     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5271     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5272         if ( zSig1 == 0 ) zSig1 = 1;
5273         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5274         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5275         mul64To128( zSig1, zSig1, &term2, &term3 );
5276         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5277         while ( (sbits64) rem1 < 0 ) {
5278             --zSig1;
5279             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5280             term3 |= 1;
5281             term2 |= doubleZSig0;
5282             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5283         }
5284         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5285     }
5286     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5287     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5288 
5289 }
5290 
5291 /*
5292 -------------------------------------------------------------------------------
5293 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5294 the corresponding value `b', and 0 otherwise.  The comparison is performed
5295 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5296 -------------------------------------------------------------------------------
5297 */
5298 flag float128_eq( float128 a, float128 b )
5299 {
5300 
5301     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5302               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5303          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5304               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5305        ) {
5306         if (    float128_is_signaling_nan( a )
5307              || float128_is_signaling_nan( b ) ) {
5308             float_raise( float_flag_invalid );
5309         }
5310         return 0;
5311     }
5312     return
5313            ( a.low == b.low )
5314         && (    ( a.high == b.high )
5315              || (    ( a.low == 0 )
5316                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5317            );
5318 
5319 }
5320 
5321 /*
5322 -------------------------------------------------------------------------------
5323 Returns 1 if the quadruple-precision floating-point value `a' is less than
5324 or equal to the corresponding value `b', and 0 otherwise.  The comparison
5325 is performed according to the IEC/IEEE Standard for Binary Floating-Point
5326 Arithmetic.
5327 -------------------------------------------------------------------------------
5328 */
5329 flag float128_le( float128 a, float128 b )
5330 {
5331     flag aSign, bSign;
5332 
5333     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5334               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5335          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5336               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5337        ) {
5338         float_raise( float_flag_invalid );
5339         return 0;
5340     }
5341     aSign = extractFloat128Sign( a );
5342     bSign = extractFloat128Sign( b );
5343     if ( aSign != bSign ) {
5344         return
5345                aSign
5346             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5347                  == 0 );
5348     }
5349     return
5350           aSign ? le128( b.high, b.low, a.high, a.low )
5351         : le128( a.high, a.low, b.high, b.low );
5352 
5353 }
5354 
5355 /*
5356 -------------------------------------------------------------------------------
5357 Returns 1 if the quadruple-precision floating-point value `a' is less than
5358 the corresponding value `b', and 0 otherwise.  The comparison is performed
5359 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5360 -------------------------------------------------------------------------------
5361 */
5362 flag float128_lt( float128 a, float128 b )
5363 {
5364     flag aSign, bSign;
5365 
5366     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5367               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5368          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5369               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5370        ) {
5371         float_raise( float_flag_invalid );
5372         return 0;
5373     }
5374     aSign = extractFloat128Sign( a );
5375     bSign = extractFloat128Sign( b );
5376     if ( aSign != bSign ) {
5377         return
5378                aSign
5379             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5380                  != 0 );
5381     }
5382     return
5383           aSign ? lt128( b.high, b.low, a.high, a.low )
5384         : lt128( a.high, a.low, b.high, b.low );
5385 
5386 }
5387 
5388 /*
5389 -------------------------------------------------------------------------------
5390 Returns 1 if the quadruple-precision floating-point value `a' is equal to
5391 the corresponding value `b', and 0 otherwise.  The invalid exception is
5392 raised if either operand is a NaN.  Otherwise, the comparison is performed
5393 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5394 -------------------------------------------------------------------------------
5395 */
5396 flag float128_eq_signaling( float128 a, float128 b )
5397 {
5398 
5399     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5400               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5401          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5402               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5403        ) {
5404         float_raise( float_flag_invalid );
5405         return 0;
5406     }
5407     return
5408            ( a.low == b.low )
5409         && (    ( a.high == b.high )
5410              || (    ( a.low == 0 )
5411                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5412            );
5413 
5414 }
5415 
5416 /*
5417 -------------------------------------------------------------------------------
5418 Returns 1 if the quadruple-precision floating-point value `a' is less than
5419 or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5420 cause an exception.  Otherwise, the comparison is performed according to the
5421 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5422 -------------------------------------------------------------------------------
5423 */
5424 flag float128_le_quiet( float128 a, float128 b )
5425 {
5426     flag aSign, bSign;
5427 
5428     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5429               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5430          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5431               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5432        ) {
5433         if (    float128_is_signaling_nan( a )
5434              || float128_is_signaling_nan( b ) ) {
5435             float_raise( float_flag_invalid );
5436         }
5437         return 0;
5438     }
5439     aSign = extractFloat128Sign( a );
5440     bSign = extractFloat128Sign( b );
5441     if ( aSign != bSign ) {
5442         return
5443                aSign
5444             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5445                  == 0 );
5446     }
5447     return
5448           aSign ? le128( b.high, b.low, a.high, a.low )
5449         : le128( a.high, a.low, b.high, b.low );
5450 
5451 }
5452 
5453 /*
5454 -------------------------------------------------------------------------------
5455 Returns 1 if the quadruple-precision floating-point value `a' is less than
5456 the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5457 exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5458 Standard for Binary Floating-Point Arithmetic.
5459 -------------------------------------------------------------------------------
5460 */
5461 flag float128_lt_quiet( float128 a, float128 b )
5462 {
5463     flag aSign, bSign;
5464 
5465     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5466               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5467          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5468               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5469        ) {
5470         if (    float128_is_signaling_nan( a )
5471              || float128_is_signaling_nan( b ) ) {
5472             float_raise( float_flag_invalid );
5473         }
5474         return 0;
5475     }
5476     aSign = extractFloat128Sign( a );
5477     bSign = extractFloat128Sign( b );
5478     if ( aSign != bSign ) {
5479         return
5480                aSign
5481             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5482                  != 0 );
5483     }
5484     return
5485           aSign ? lt128( b.high, b.low, a.high, a.low )
5486         : lt128( a.high, a.low, b.high, b.low );
5487 
5488 }
5489 
5490 #endif
5491 
5492 
5493 #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5494 
5495 /*
5496  * These two routines are not part of the original softfloat distribution.
5497  *
5498  * They are based on the corresponding conversions to integer but return
5499  * unsigned numbers instead since these functions are required by GCC.
5500  *
5501  * Added by Mark Brinicombe <[email protected]>	27/09/97
5502  *
5503  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5504  */
5505 
5506 /*
5507 -------------------------------------------------------------------------------
5508 Returns the result of converting the double-precision floating-point value
5509 `a' to the 32-bit unsigned integer format.  The conversion is
5510 performed according to the IEC/IEEE Standard for Binary Floating-point
5511 Arithmetic, except that the conversion is always rounded toward zero.  If
5512 `a' is a NaN, the largest positive integer is returned.  If the conversion
5513 overflows, the largest integer positive is returned.
5514 -------------------------------------------------------------------------------
5515 */
5516 uint32 float64_to_uint32_round_to_zero( float64 a )
5517 {
5518     flag aSign;
5519     int16 aExp, shiftCount;
5520     bits64 aSig, savedASig;
5521     uint32 z;
5522 
5523     aSig = extractFloat64Frac( a );
5524     aExp = extractFloat64Exp( a );
5525     aSign = extractFloat64Sign( a );
5526 
5527     if (aSign) {
5528         float_raise( float_flag_invalid );
5529     	return(0);
5530     }
5531 
5532     if ( 0x41E < aExp ) {
5533         float_raise( float_flag_invalid );
5534         return 0xffffffff;
5535     }
5536     else if ( aExp < 0x3FF ) {
5537         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
5538         return 0;
5539     }
5540     aSig |= LIT64( 0x0010000000000000 );
5541     shiftCount = 0x433 - aExp;
5542     savedASig = aSig;
5543     aSig >>= shiftCount;
5544     z = aSig;
5545     if ( ( aSig<<shiftCount ) != savedASig ) {
5546         float_exception_flags |= float_flag_inexact;
5547     }
5548     return z;
5549 
5550 }
5551 
5552 /*
5553 -------------------------------------------------------------------------------
5554 Returns the result of converting the single-precision floating-point value
5555 `a' to the 32-bit unsigned integer format.  The conversion is
5556 performed according to the IEC/IEEE Standard for Binary Floating-point
5557 Arithmetic, except that the conversion is always rounded toward zero.  If
5558 `a' is a NaN, the largest positive integer is returned.  If the conversion
5559 overflows, the largest positive integer is returned.
5560 -------------------------------------------------------------------------------
5561 */
5562 uint32 float32_to_uint32_round_to_zero( float32 a )
5563 {
5564     flag aSign;
5565     int16 aExp, shiftCount;
5566     bits32 aSig;
5567     uint32 z;
5568 
5569     aSig = extractFloat32Frac( a );
5570     aExp = extractFloat32Exp( a );
5571     aSign = extractFloat32Sign( a );
5572     shiftCount = aExp - 0x9E;
5573 
5574     if (aSign) {
5575         float_raise( float_flag_invalid );
5576     	return(0);
5577     }
5578     if ( 0 < shiftCount ) {
5579         float_raise( float_flag_invalid );
5580         return 0xFFFFFFFF;
5581     }
5582     else if ( aExp <= 0x7E ) {
5583         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
5584         return 0;
5585     }
5586     aSig = ( aSig | 0x800000 )<<8;
5587     z = aSig>>( - shiftCount );
5588     if ( aSig<<( shiftCount & 31 ) ) {
5589         float_exception_flags |= float_flag_inexact;
5590     }
5591     return z;
5592 
5593 }
5594 
5595 #endif
5596