1 /* $NetBSD: softfloat.c,v 1.1 2002/05/21 23:51:07 bjh21 Exp $ */
2
3 /*
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
6 * itself).
7 */
8
9 /*
10 * Things you may want to define:
11 *
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
14 * properly renamed.
15 */
16
17 /*
18 * This differs from the standard bits32/softfloat.c in that float64
19 * is defined to be a 64-bit integer rather than a structure. The
20 * structure is float64s, with translation between the two going via
21 * float64u.
22 */
23
24 /*
25 ===============================================================================
26
27 This C source file is part of the SoftFloat IEC/IEEE Floating-Point
28 Arithmetic Package, Release 2a.
29
30 Written by John R. Hauser. This work was made possible in part by the
31 International Computer Science Institute, located at Suite 600, 1947 Center
32 Street, Berkeley, California 94704. Funding was partially provided by the
33 National Science Foundation under grant MIP-9311980. The original version
34 of this code was written as part of a project to build a fixed-point vector
35 processor in collaboration with the University of California at Berkeley,
36 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
37 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
38 arithmetic/SoftFloat.html'.
39
40 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
41 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
42 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
43 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
44 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
45
46 Derivative works are acceptable, even for commercial purposes, so long as
47 (1) they include prominent notice that the work is derivative, and (2) they
48 include prominent notice akin to these four paragraphs for those parts of
49 this code that are retained.
50
51 ===============================================================================
52 */
53
54 #include <sys/cdefs.h>
55 __FBSDID("$FreeBSD$");
56
57 #ifdef SOFTFLOAT_FOR_GCC
58 #include "softfloat-for-gcc.h"
59 #endif
60
61 #include "milieu.h"
62 #include "softfloat.h"
63
64 /*
65 * Conversions between floats as stored in memory and floats as
66 * SoftFloat uses them
67 */
68 #ifndef FLOAT64_DEMANGLE
69 #define FLOAT64_DEMANGLE(a) (a)
70 #endif
71 #ifndef FLOAT64_MANGLE
72 #define FLOAT64_MANGLE(a) (a)
73 #endif
74
75 /*
76 -------------------------------------------------------------------------------
77 Floating-point rounding mode and exception flags.
78 -------------------------------------------------------------------------------
79 */
80 int float_rounding_mode = float_round_nearest_even;
81 int float_exception_flags = 0;
82
83 /*
84 -------------------------------------------------------------------------------
85 Primitive arithmetic functions, including multi-word arithmetic, and
86 division and square root approximations. (Can be specialized to target if
87 desired.)
88 -------------------------------------------------------------------------------
89 */
90 #include "softfloat-macros"
91
92 /*
93 -------------------------------------------------------------------------------
94 Functions and definitions to determine: (1) whether tininess for underflow
95 is detected before or after rounding by default, (2) what (if anything)
96 happens when exceptions are raised, (3) how signaling NaNs are distinguished
97 from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
98 are propagated from function inputs to output. These details are target-
99 specific.
100 -------------------------------------------------------------------------------
101 */
102 #include "softfloat-specialize"
103
104 /*
105 -------------------------------------------------------------------------------
106 Returns the fraction bits of the single-precision floating-point value `a'.
107 -------------------------------------------------------------------------------
108 */
extractFloat32Frac(float32 a)109 INLINE bits32 extractFloat32Frac( float32 a )
110 {
111
112 return a & 0x007FFFFF;
113
114 }
115
116 /*
117 -------------------------------------------------------------------------------
118 Returns the exponent bits of the single-precision floating-point value `a'.
119 -------------------------------------------------------------------------------
120 */
extractFloat32Exp(float32 a)121 INLINE int16 extractFloat32Exp( float32 a )
122 {
123
124 return ( a>>23 ) & 0xFF;
125
126 }
127
128 /*
129 -------------------------------------------------------------------------------
130 Returns the sign bit of the single-precision floating-point value `a'.
131 -------------------------------------------------------------------------------
132 */
extractFloat32Sign(float32 a)133 INLINE flag extractFloat32Sign( float32 a )
134 {
135
136 return a>>31;
137
138 }
139
140 /*
141 -------------------------------------------------------------------------------
142 Normalizes the subnormal single-precision floating-point value represented
143 by the denormalized significand `aSig'. The normalized exponent and
144 significand are stored at the locations pointed to by `zExpPtr' and
145 `zSigPtr', respectively.
146 -------------------------------------------------------------------------------
147 */
148 static void
normalizeFloat32Subnormal(bits32 aSig,int16 * zExpPtr,bits32 * zSigPtr)149 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
150 {
151 int8 shiftCount;
152
153 shiftCount = countLeadingZeros32( aSig ) - 8;
154 *zSigPtr = aSig<<shiftCount;
155 *zExpPtr = 1 - shiftCount;
156
157 }
158
159 /*
160 -------------------------------------------------------------------------------
161 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
162 single-precision floating-point value, returning the result. After being
163 shifted into the proper positions, the three fields are simply added
164 together to form the result. This means that any integer portion of `zSig'
165 will be added into the exponent. Since a properly normalized significand
166 will have an integer portion equal to 1, the `zExp' input should be 1 less
167 than the desired result exponent whenever `zSig' is a complete, normalized
168 significand.
169 -------------------------------------------------------------------------------
170 */
packFloat32(flag zSign,int16 zExp,bits32 zSig)171 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
172 {
173
174 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
175
176 }
177
178 /*
179 -------------------------------------------------------------------------------
180 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
181 and significand `zSig', and returns the proper single-precision floating-
182 point value corresponding to the abstract input. Ordinarily, the abstract
183 value is simply rounded and packed into the single-precision format, with
184 the inexact exception raised if the abstract input cannot be represented
185 exactly. However, if the abstract value is too large, the overflow and
186 inexact exceptions are raised and an infinity or maximal finite value is
187 returned. If the abstract value is too small, the input value is rounded to
188 a subnormal number, and the underflow and inexact exceptions are raised if
189 the abstract input cannot be represented exactly as a subnormal single-
190 precision floating-point number.
191 The input significand `zSig' has its binary point between bits 30
192 and 29, which is 7 bits to the left of the usual location. This shifted
193 significand must be normalized or smaller. If `zSig' is not normalized,
194 `zExp' must be 0; in that case, the result returned is a subnormal number,
195 and it must not require rounding. In the usual case that `zSig' is
196 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
197 The handling of underflow and overflow follows the IEC/IEEE Standard for
198 Binary Floating-Point Arithmetic.
199 -------------------------------------------------------------------------------
200 */
roundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)201 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
202 {
203 int8 roundingMode;
204 flag roundNearestEven;
205 int8 roundIncrement, roundBits;
206 flag isTiny;
207
208 roundingMode = float_rounding_mode;
209 roundNearestEven = roundingMode == float_round_nearest_even;
210 roundIncrement = 0x40;
211 if ( ! roundNearestEven ) {
212 if ( roundingMode == float_round_to_zero ) {
213 roundIncrement = 0;
214 }
215 else {
216 roundIncrement = 0x7F;
217 if ( zSign ) {
218 if ( roundingMode == float_round_up ) roundIncrement = 0;
219 }
220 else {
221 if ( roundingMode == float_round_down ) roundIncrement = 0;
222 }
223 }
224 }
225 roundBits = zSig & 0x7F;
226 if ( 0xFD <= (bits16) zExp ) {
227 if ( ( 0xFD < zExp )
228 || ( ( zExp == 0xFD )
229 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
230 ) {
231 float_raise( float_flag_overflow | float_flag_inexact );
232 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
233 }
234 if ( zExp < 0 ) {
235 isTiny =
236 ( float_detect_tininess == float_tininess_before_rounding )
237 || ( zExp < -1 )
238 || ( zSig + roundIncrement < 0x80000000 );
239 shift32RightJamming( zSig, - zExp, &zSig );
240 zExp = 0;
241 roundBits = zSig & 0x7F;
242 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
243 }
244 }
245 if ( roundBits ) float_exception_flags |= float_flag_inexact;
246 zSig = ( zSig + roundIncrement )>>7;
247 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
248 if ( zSig == 0 ) zExp = 0;
249 return packFloat32( zSign, zExp, zSig );
250
251 }
252
253 /*
254 -------------------------------------------------------------------------------
255 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
256 and significand `zSig', and returns the proper single-precision floating-
257 point value corresponding to the abstract input. This routine is just like
258 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
259 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
260 floating-point exponent.
261 -------------------------------------------------------------------------------
262 */
263 static float32
normalizeRoundAndPackFloat32(flag zSign,int16 zExp,bits32 zSig)264 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
265 {
266 int8 shiftCount;
267
268 shiftCount = countLeadingZeros32( zSig ) - 1;
269 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
270
271 }
272
273 /*
274 -------------------------------------------------------------------------------
275 Returns the least-significant 32 fraction bits of the double-precision
276 floating-point value `a'.
277 -------------------------------------------------------------------------------
278 */
extractFloat64Frac1(float64 a)279 INLINE bits32 extractFloat64Frac1( float64 a )
280 {
281
282 return FLOAT64_DEMANGLE(a) & LIT64( 0x00000000FFFFFFFF );
283
284 }
285
286 /*
287 -------------------------------------------------------------------------------
288 Returns the most-significant 20 fraction bits of the double-precision
289 floating-point value `a'.
290 -------------------------------------------------------------------------------
291 */
extractFloat64Frac0(float64 a)292 INLINE bits32 extractFloat64Frac0( float64 a )
293 {
294
295 return ( FLOAT64_DEMANGLE(a)>>32 ) & 0x000FFFFF;
296
297 }
298
299 /*
300 -------------------------------------------------------------------------------
301 Returns the exponent bits of the double-precision floating-point value `a'.
302 -------------------------------------------------------------------------------
303 */
extractFloat64Exp(float64 a)304 INLINE int16 extractFloat64Exp( float64 a )
305 {
306
307 return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
308
309 }
310
311 /*
312 -------------------------------------------------------------------------------
313 Returns the sign bit of the double-precision floating-point value `a'.
314 -------------------------------------------------------------------------------
315 */
extractFloat64Sign(float64 a)316 INLINE flag extractFloat64Sign( float64 a )
317 {
318
319 return FLOAT64_DEMANGLE(a)>>63;
320
321 }
322
323 /*
324 -------------------------------------------------------------------------------
325 Normalizes the subnormal double-precision floating-point value represented
326 by the denormalized significand formed by the concatenation of `aSig0' and
327 `aSig1'. The normalized exponent is stored at the location pointed to by
328 `zExpPtr'. The most significant 21 bits of the normalized significand are
329 stored at the location pointed to by `zSig0Ptr', and the least significant
330 32 bits of the normalized significand are stored at the location pointed to
331 by `zSig1Ptr'.
332 -------------------------------------------------------------------------------
333 */
334 static void
normalizeFloat64Subnormal(bits32 aSig0,bits32 aSig1,int16 * zExpPtr,bits32 * zSig0Ptr,bits32 * zSig1Ptr)335 normalizeFloat64Subnormal(
336 bits32 aSig0,
337 bits32 aSig1,
338 int16 *zExpPtr,
339 bits32 *zSig0Ptr,
340 bits32 *zSig1Ptr
341 )
342 {
343 int8 shiftCount;
344
345 if ( aSig0 == 0 ) {
346 shiftCount = countLeadingZeros32( aSig1 ) - 11;
347 if ( shiftCount < 0 ) {
348 *zSig0Ptr = aSig1>>( - shiftCount );
349 *zSig1Ptr = aSig1<<( shiftCount & 31 );
350 }
351 else {
352 *zSig0Ptr = aSig1<<shiftCount;
353 *zSig1Ptr = 0;
354 }
355 *zExpPtr = - shiftCount - 31;
356 }
357 else {
358 shiftCount = countLeadingZeros32( aSig0 ) - 11;
359 shortShift64Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
360 *zExpPtr = 1 - shiftCount;
361 }
362
363 }
364
365 /*
366 -------------------------------------------------------------------------------
367 Packs the sign `zSign', the exponent `zExp', and the significand formed by
368 the concatenation of `zSig0' and `zSig1' into a double-precision floating-
369 point value, returning the result. After being shifted into the proper
370 positions, the three fields `zSign', `zExp', and `zSig0' are simply added
371 together to form the most significant 32 bits of the result. This means
372 that any integer portion of `zSig0' will be added into the exponent. Since
373 a properly normalized significand will have an integer portion equal to 1,
374 the `zExp' input should be 1 less than the desired result exponent whenever
375 `zSig0' and `zSig1' concatenated form a complete, normalized significand.
376 -------------------------------------------------------------------------------
377 */
378 INLINE float64
packFloat64(flag zSign,int16 zExp,bits32 zSig0,bits32 zSig1)379 packFloat64( flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
380 {
381
382 return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
383 ( ( (bits64) zExp )<<52 ) +
384 ( ( (bits64) zSig0 )<<32 ) + zSig1 );
385
386
387 }
388
389 /*
390 -------------------------------------------------------------------------------
391 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
392 and extended significand formed by the concatenation of `zSig0', `zSig1',
393 and `zSig2', and returns the proper double-precision floating-point value
394 corresponding to the abstract input. Ordinarily, the abstract value is
395 simply rounded and packed into the double-precision format, with the inexact
396 exception raised if the abstract input cannot be represented exactly.
397 However, if the abstract value is too large, the overflow and inexact
398 exceptions are raised and an infinity or maximal finite value is returned.
399 If the abstract value is too small, the input value is rounded to a
400 subnormal number, and the underflow and inexact exceptions are raised if the
401 abstract input cannot be represented exactly as a subnormal double-precision
402 floating-point number.
403 The input significand must be normalized or smaller. If the input
404 significand is not normalized, `zExp' must be 0; in that case, the result
405 returned is a subnormal number, and it must not require rounding. In the
406 usual case that the input significand is normalized, `zExp' must be 1 less
407 than the ``true'' floating-point exponent. The handling of underflow and
408 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
409 -------------------------------------------------------------------------------
410 */
411 static float64
roundAndPackFloat64(flag zSign,int16 zExp,bits32 zSig0,bits32 zSig1,bits32 zSig2)412 roundAndPackFloat64(
413 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1, bits32 zSig2 )
414 {
415 int8 roundingMode;
416 flag roundNearestEven, increment, isTiny;
417
418 roundingMode = float_rounding_mode;
419 roundNearestEven = ( roundingMode == float_round_nearest_even );
420 increment = ( (sbits32) zSig2 < 0 );
421 if ( ! roundNearestEven ) {
422 if ( roundingMode == float_round_to_zero ) {
423 increment = 0;
424 }
425 else {
426 if ( zSign ) {
427 increment = ( roundingMode == float_round_down ) && zSig2;
428 }
429 else {
430 increment = ( roundingMode == float_round_up ) && zSig2;
431 }
432 }
433 }
434 if ( 0x7FD <= (bits16) zExp ) {
435 if ( ( 0x7FD < zExp )
436 || ( ( zExp == 0x7FD )
437 && eq64( 0x001FFFFF, 0xFFFFFFFF, zSig0, zSig1 )
438 && increment
439 )
440 ) {
441 float_raise( float_flag_overflow | float_flag_inexact );
442 if ( ( roundingMode == float_round_to_zero )
443 || ( zSign && ( roundingMode == float_round_up ) )
444 || ( ! zSign && ( roundingMode == float_round_down ) )
445 ) {
446 return packFloat64( zSign, 0x7FE, 0x000FFFFF, 0xFFFFFFFF );
447 }
448 return packFloat64( zSign, 0x7FF, 0, 0 );
449 }
450 if ( zExp < 0 ) {
451 isTiny =
452 ( float_detect_tininess == float_tininess_before_rounding )
453 || ( zExp < -1 )
454 || ! increment
455 || lt64( zSig0, zSig1, 0x001FFFFF, 0xFFFFFFFF );
456 shift64ExtraRightJamming(
457 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
458 zExp = 0;
459 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
460 if ( roundNearestEven ) {
461 increment = ( (sbits32) zSig2 < 0 );
462 }
463 else {
464 if ( zSign ) {
465 increment = ( roundingMode == float_round_down ) && zSig2;
466 }
467 else {
468 increment = ( roundingMode == float_round_up ) && zSig2;
469 }
470 }
471 }
472 }
473 if ( zSig2 ) float_exception_flags |= float_flag_inexact;
474 if ( increment ) {
475 add64( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
476 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
477 }
478 else {
479 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
480 }
481 return packFloat64( zSign, zExp, zSig0, zSig1 );
482
483 }
484
485 /*
486 -------------------------------------------------------------------------------
487 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
488 and significand formed by the concatenation of `zSig0' and `zSig1', and
489 returns the proper double-precision floating-point value corresponding
490 to the abstract input. This routine is just like `roundAndPackFloat64'
491 except that the input significand has fewer bits and does not have to be
492 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
493 point exponent.
494 -------------------------------------------------------------------------------
495 */
496 static float64
normalizeRoundAndPackFloat64(flag zSign,int16 zExp,bits32 zSig0,bits32 zSig1)497 normalizeRoundAndPackFloat64(
498 flag zSign, int16 zExp, bits32 zSig0, bits32 zSig1 )
499 {
500 int8 shiftCount;
501 bits32 zSig2;
502
503 if ( zSig0 == 0 ) {
504 zSig0 = zSig1;
505 zSig1 = 0;
506 zExp -= 32;
507 }
508 shiftCount = countLeadingZeros32( zSig0 ) - 11;
509 if ( 0 <= shiftCount ) {
510 zSig2 = 0;
511 shortShift64Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
512 }
513 else {
514 shift64ExtraRightJamming(
515 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
516 }
517 zExp -= shiftCount;
518 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
519
520 }
521
522 /*
523 -------------------------------------------------------------------------------
524 Returns the result of converting the 32-bit two's complement integer `a' to
525 the single-precision floating-point format. The conversion is performed
526 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
527 -------------------------------------------------------------------------------
528 */
int32_to_float32(int32 a)529 float32 int32_to_float32( int32 a )
530 {
531 flag zSign;
532
533 if ( a == 0 ) return 0;
534 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
535 zSign = ( a < 0 );
536 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
537
538 }
539
540 /*
541 -------------------------------------------------------------------------------
542 Returns the result of converting the 32-bit two's complement integer `a' to
543 the double-precision floating-point format. The conversion is performed
544 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
545 -------------------------------------------------------------------------------
546 */
int32_to_float64(int32 a)547 float64 int32_to_float64( int32 a )
548 {
549 flag zSign;
550 bits32 absA;
551 int8 shiftCount;
552 bits32 zSig0, zSig1;
553
554 if ( a == 0 ) return packFloat64( 0, 0, 0, 0 );
555 zSign = ( a < 0 );
556 absA = zSign ? - a : a;
557 shiftCount = countLeadingZeros32( absA ) - 11;
558 if ( 0 <= shiftCount ) {
559 zSig0 = absA<<shiftCount;
560 zSig1 = 0;
561 }
562 else {
563 shift64Right( absA, 0, - shiftCount, &zSig0, &zSig1 );
564 }
565 return packFloat64( zSign, 0x412 - shiftCount, zSig0, zSig1 );
566
567 }
568
569 #ifndef SOFTFLOAT_FOR_GCC
570 /*
571 -------------------------------------------------------------------------------
572 Returns the result of converting the single-precision floating-point value
573 `a' to the 32-bit two's complement integer format. The conversion is
574 performed according to the IEC/IEEE Standard for Binary Floating-Point
575 Arithmetic---which means in particular that the conversion is rounded
576 according to the current rounding mode. If `a' is a NaN, the largest
577 positive integer is returned. Otherwise, if the conversion overflows, the
578 largest integer with the same sign as `a' is returned.
579 -------------------------------------------------------------------------------
580 */
float32_to_int32(float32 a)581 int32 float32_to_int32( float32 a )
582 {
583 flag aSign;
584 int16 aExp, shiftCount;
585 bits32 aSig, aSigExtra;
586 int32 z;
587 int8 roundingMode;
588
589 aSig = extractFloat32Frac( a );
590 aExp = extractFloat32Exp( a );
591 aSign = extractFloat32Sign( a );
592 shiftCount = aExp - 0x96;
593 if ( 0 <= shiftCount ) {
594 if ( 0x9E <= aExp ) {
595 if ( a != 0xCF000000 ) {
596 float_raise( float_flag_invalid );
597 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
598 return 0x7FFFFFFF;
599 }
600 }
601 return (sbits32) 0x80000000;
602 }
603 z = ( aSig | 0x00800000 )<<shiftCount;
604 if ( aSign ) z = - z;
605 }
606 else {
607 if ( aExp < 0x7E ) {
608 aSigExtra = aExp | aSig;
609 z = 0;
610 }
611 else {
612 aSig |= 0x00800000;
613 aSigExtra = aSig<<( shiftCount & 31 );
614 z = aSig>>( - shiftCount );
615 }
616 if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
617 roundingMode = float_rounding_mode;
618 if ( roundingMode == float_round_nearest_even ) {
619 if ( (sbits32) aSigExtra < 0 ) {
620 ++z;
621 if ( (bits32) ( aSigExtra<<1 ) == 0 ) z &= ~1;
622 }
623 if ( aSign ) z = - z;
624 }
625 else {
626 aSigExtra = ( aSigExtra != 0 );
627 if ( aSign ) {
628 z += ( roundingMode == float_round_down ) & aSigExtra;
629 z = - z;
630 }
631 else {
632 z += ( roundingMode == float_round_up ) & aSigExtra;
633 }
634 }
635 }
636 return z;
637
638 }
639 #endif
640
641 /*
642 -------------------------------------------------------------------------------
643 Returns the result of converting the single-precision floating-point value
644 `a' to the 32-bit two's complement integer format. The conversion is
645 performed according to the IEC/IEEE Standard for Binary Floating-Point
646 Arithmetic, except that the conversion is always rounded toward zero.
647 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
648 the conversion overflows, the largest integer with the same sign as `a' is
649 returned.
650 -------------------------------------------------------------------------------
651 */
float32_to_int32_round_to_zero(float32 a)652 int32 float32_to_int32_round_to_zero( float32 a )
653 {
654 flag aSign;
655 int16 aExp, shiftCount;
656 bits32 aSig;
657 int32 z;
658
659 aSig = extractFloat32Frac( a );
660 aExp = extractFloat32Exp( a );
661 aSign = extractFloat32Sign( a );
662 shiftCount = aExp - 0x9E;
663 if ( 0 <= shiftCount ) {
664 if ( a != 0xCF000000 ) {
665 float_raise( float_flag_invalid );
666 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
667 }
668 return (sbits32) 0x80000000;
669 }
670 else if ( aExp <= 0x7E ) {
671 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
672 return 0;
673 }
674 aSig = ( aSig | 0x00800000 )<<8;
675 z = aSig>>( - shiftCount );
676 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
677 float_exception_flags |= float_flag_inexact;
678 }
679 if ( aSign ) z = - z;
680 return z;
681
682 }
683
684 /*
685 -------------------------------------------------------------------------------
686 Returns the result of converting the single-precision floating-point value
687 `a' to the double-precision floating-point format. The conversion is
688 performed according to the IEC/IEEE Standard for Binary Floating-Point
689 Arithmetic.
690 -------------------------------------------------------------------------------
691 */
float32_to_float64(float32 a)692 float64 float32_to_float64( float32 a )
693 {
694 flag aSign;
695 int16 aExp;
696 bits32 aSig, zSig0, zSig1;
697
698 aSig = extractFloat32Frac( a );
699 aExp = extractFloat32Exp( a );
700 aSign = extractFloat32Sign( a );
701 if ( aExp == 0xFF ) {
702 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
703 return packFloat64( aSign, 0x7FF, 0, 0 );
704 }
705 if ( aExp == 0 ) {
706 if ( aSig == 0 ) return packFloat64( aSign, 0, 0, 0 );
707 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
708 --aExp;
709 }
710 shift64Right( aSig, 0, 3, &zSig0, &zSig1 );
711 return packFloat64( aSign, aExp + 0x380, zSig0, zSig1 );
712
713 }
714
715 #ifndef SOFTFLOAT_FOR_GCC
716 /*
717 -------------------------------------------------------------------------------
718 Rounds the single-precision floating-point value `a' to an integer,
719 and returns the result as a single-precision floating-point value. The
720 operation is performed according to the IEC/IEEE Standard for Binary
721 Floating-Point Arithmetic.
722 -------------------------------------------------------------------------------
723 */
float32_round_to_int(float32 a)724 float32 float32_round_to_int( float32 a )
725 {
726 flag aSign;
727 int16 aExp;
728 bits32 lastBitMask, roundBitsMask;
729 int8 roundingMode;
730 float32 z;
731
732 aExp = extractFloat32Exp( a );
733 if ( 0x96 <= aExp ) {
734 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
735 return propagateFloat32NaN( a, a );
736 }
737 return a;
738 }
739 if ( aExp <= 0x7E ) {
740 if ( (bits32) ( a<<1 ) == 0 ) return a;
741 float_exception_flags |= float_flag_inexact;
742 aSign = extractFloat32Sign( a );
743 switch ( float_rounding_mode ) {
744 case float_round_nearest_even:
745 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
746 return packFloat32( aSign, 0x7F, 0 );
747 }
748 break;
749 case float_round_to_zero:
750 break;
751 case float_round_down:
752 return aSign ? 0xBF800000 : 0;
753 case float_round_up:
754 return aSign ? 0x80000000 : 0x3F800000;
755 }
756 return packFloat32( aSign, 0, 0 );
757 }
758 lastBitMask = 1;
759 lastBitMask <<= 0x96 - aExp;
760 roundBitsMask = lastBitMask - 1;
761 z = a;
762 roundingMode = float_rounding_mode;
763 if ( roundingMode == float_round_nearest_even ) {
764 z += lastBitMask>>1;
765 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
766 }
767 else if ( roundingMode != float_round_to_zero ) {
768 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
769 z += roundBitsMask;
770 }
771 }
772 z &= ~ roundBitsMask;
773 if ( z != a ) float_exception_flags |= float_flag_inexact;
774 return z;
775
776 }
777 #endif
778
779 /*
780 -------------------------------------------------------------------------------
781 Returns the result of adding the absolute values of the single-precision
782 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
783 before being returned. `zSign' is ignored if the result is a NaN.
784 The addition is performed according to the IEC/IEEE Standard for Binary
785 Floating-Point Arithmetic.
786 -------------------------------------------------------------------------------
787 */
addFloat32Sigs(float32 a,float32 b,flag zSign)788 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
789 {
790 int16 aExp, bExp, zExp;
791 bits32 aSig, bSig, zSig;
792 int16 expDiff;
793
794 aSig = extractFloat32Frac( a );
795 aExp = extractFloat32Exp( a );
796 bSig = extractFloat32Frac( b );
797 bExp = extractFloat32Exp( b );
798 expDiff = aExp - bExp;
799 aSig <<= 6;
800 bSig <<= 6;
801 if ( 0 < expDiff ) {
802 if ( aExp == 0xFF ) {
803 if ( aSig ) return propagateFloat32NaN( a, b );
804 return a;
805 }
806 if ( bExp == 0 ) {
807 --expDiff;
808 }
809 else {
810 bSig |= 0x20000000;
811 }
812 shift32RightJamming( bSig, expDiff, &bSig );
813 zExp = aExp;
814 }
815 else if ( expDiff < 0 ) {
816 if ( bExp == 0xFF ) {
817 if ( bSig ) return propagateFloat32NaN( a, b );
818 return packFloat32( zSign, 0xFF, 0 );
819 }
820 if ( aExp == 0 ) {
821 ++expDiff;
822 }
823 else {
824 aSig |= 0x20000000;
825 }
826 shift32RightJamming( aSig, - expDiff, &aSig );
827 zExp = bExp;
828 }
829 else {
830 if ( aExp == 0xFF ) {
831 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
832 return a;
833 }
834 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
835 zSig = 0x40000000 + aSig + bSig;
836 zExp = aExp;
837 goto roundAndPack;
838 }
839 aSig |= 0x20000000;
840 zSig = ( aSig + bSig )<<1;
841 --zExp;
842 if ( (sbits32) zSig < 0 ) {
843 zSig = aSig + bSig;
844 ++zExp;
845 }
846 roundAndPack:
847 return roundAndPackFloat32( zSign, zExp, zSig );
848
849 }
850
851 /*
852 -------------------------------------------------------------------------------
853 Returns the result of subtracting the absolute values of the single-
854 precision floating-point values `a' and `b'. If `zSign' is 1, the
855 difference is negated before being returned. `zSign' is ignored if the
856 result is a NaN. The subtraction is performed according to the IEC/IEEE
857 Standard for Binary Floating-Point Arithmetic.
858 -------------------------------------------------------------------------------
859 */
subFloat32Sigs(float32 a,float32 b,flag zSign)860 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
861 {
862 int16 aExp, bExp, zExp;
863 bits32 aSig, bSig, zSig;
864 int16 expDiff;
865
866 aSig = extractFloat32Frac( a );
867 aExp = extractFloat32Exp( a );
868 bSig = extractFloat32Frac( b );
869 bExp = extractFloat32Exp( b );
870 expDiff = aExp - bExp;
871 aSig <<= 7;
872 bSig <<= 7;
873 if ( 0 < expDiff ) goto aExpBigger;
874 if ( expDiff < 0 ) goto bExpBigger;
875 if ( aExp == 0xFF ) {
876 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
877 float_raise( float_flag_invalid );
878 return float32_default_nan;
879 }
880 if ( aExp == 0 ) {
881 aExp = 1;
882 bExp = 1;
883 }
884 if ( bSig < aSig ) goto aBigger;
885 if ( aSig < bSig ) goto bBigger;
886 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
887 bExpBigger:
888 if ( bExp == 0xFF ) {
889 if ( bSig ) return propagateFloat32NaN( a, b );
890 return packFloat32( zSign ^ 1, 0xFF, 0 );
891 }
892 if ( aExp == 0 ) {
893 ++expDiff;
894 }
895 else {
896 aSig |= 0x40000000;
897 }
898 shift32RightJamming( aSig, - expDiff, &aSig );
899 bSig |= 0x40000000;
900 bBigger:
901 zSig = bSig - aSig;
902 zExp = bExp;
903 zSign ^= 1;
904 goto normalizeRoundAndPack;
905 aExpBigger:
906 if ( aExp == 0xFF ) {
907 if ( aSig ) return propagateFloat32NaN( a, b );
908 return a;
909 }
910 if ( bExp == 0 ) {
911 --expDiff;
912 }
913 else {
914 bSig |= 0x40000000;
915 }
916 shift32RightJamming( bSig, expDiff, &bSig );
917 aSig |= 0x40000000;
918 aBigger:
919 zSig = aSig - bSig;
920 zExp = aExp;
921 normalizeRoundAndPack:
922 --zExp;
923 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
924
925 }
926
927 /*
928 -------------------------------------------------------------------------------
929 Returns the result of adding the single-precision floating-point values `a'
930 and `b'. The operation is performed according to the IEC/IEEE Standard for
931 Binary Floating-Point Arithmetic.
932 -------------------------------------------------------------------------------
933 */
float32_add(float32 a,float32 b)934 float32 float32_add( float32 a, float32 b )
935 {
936 flag aSign, bSign;
937
938 aSign = extractFloat32Sign( a );
939 bSign = extractFloat32Sign( b );
940 if ( aSign == bSign ) {
941 return addFloat32Sigs( a, b, aSign );
942 }
943 else {
944 return subFloat32Sigs( a, b, aSign );
945 }
946
947 }
948
949 /*
950 -------------------------------------------------------------------------------
951 Returns the result of subtracting the single-precision floating-point values
952 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
953 for Binary Floating-Point Arithmetic.
954 -------------------------------------------------------------------------------
955 */
float32_sub(float32 a,float32 b)956 float32 float32_sub( float32 a, float32 b )
957 {
958 flag aSign, bSign;
959
960 aSign = extractFloat32Sign( a );
961 bSign = extractFloat32Sign( b );
962 if ( aSign == bSign ) {
963 return subFloat32Sigs( a, b, aSign );
964 }
965 else {
966 return addFloat32Sigs( a, b, aSign );
967 }
968
969 }
970
971 /*
972 -------------------------------------------------------------------------------
973 Returns the result of multiplying the single-precision floating-point values
974 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
975 for Binary Floating-Point Arithmetic.
976 -------------------------------------------------------------------------------
977 */
float32_mul(float32 a,float32 b)978 float32 float32_mul( float32 a, float32 b )
979 {
980 flag aSign, bSign, zSign;
981 int16 aExp, bExp, zExp;
982 bits32 aSig, bSig, zSig0, zSig1;
983
984 aSig = extractFloat32Frac( a );
985 aExp = extractFloat32Exp( a );
986 aSign = extractFloat32Sign( a );
987 bSig = extractFloat32Frac( b );
988 bExp = extractFloat32Exp( b );
989 bSign = extractFloat32Sign( b );
990 zSign = aSign ^ bSign;
991 if ( aExp == 0xFF ) {
992 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
993 return propagateFloat32NaN( a, b );
994 }
995 if ( ( bExp | bSig ) == 0 ) {
996 float_raise( float_flag_invalid );
997 return float32_default_nan;
998 }
999 return packFloat32( zSign, 0xFF, 0 );
1000 }
1001 if ( bExp == 0xFF ) {
1002 if ( bSig ) return propagateFloat32NaN( a, b );
1003 if ( ( aExp | aSig ) == 0 ) {
1004 float_raise( float_flag_invalid );
1005 return float32_default_nan;
1006 }
1007 return packFloat32( zSign, 0xFF, 0 );
1008 }
1009 if ( aExp == 0 ) {
1010 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1011 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1012 }
1013 if ( bExp == 0 ) {
1014 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1015 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1016 }
1017 zExp = aExp + bExp - 0x7F;
1018 aSig = ( aSig | 0x00800000 )<<7;
1019 bSig = ( bSig | 0x00800000 )<<8;
1020 mul32To64( aSig, bSig, &zSig0, &zSig1 );
1021 zSig0 |= ( zSig1 != 0 );
1022 if ( 0 <= (sbits32) ( zSig0<<1 ) ) {
1023 zSig0 <<= 1;
1024 --zExp;
1025 }
1026 return roundAndPackFloat32( zSign, zExp, zSig0 );
1027
1028 }
1029
1030 /*
1031 -------------------------------------------------------------------------------
1032 Returns the result of dividing the single-precision floating-point value `a'
1033 by the corresponding value `b'. The operation is performed according to the
1034 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1035 -------------------------------------------------------------------------------
1036 */
float32_div(float32 a,float32 b)1037 float32 float32_div( float32 a, float32 b )
1038 {
1039 flag aSign, bSign, zSign;
1040 int16 aExp, bExp, zExp;
1041 bits32 aSig, bSig, zSig, rem0, rem1, term0, term1;
1042
1043 aSig = extractFloat32Frac( a );
1044 aExp = extractFloat32Exp( a );
1045 aSign = extractFloat32Sign( a );
1046 bSig = extractFloat32Frac( b );
1047 bExp = extractFloat32Exp( b );
1048 bSign = extractFloat32Sign( b );
1049 zSign = aSign ^ bSign;
1050 if ( aExp == 0xFF ) {
1051 if ( aSig ) return propagateFloat32NaN( a, b );
1052 if ( bExp == 0xFF ) {
1053 if ( bSig ) return propagateFloat32NaN( a, b );
1054 float_raise( float_flag_invalid );
1055 return float32_default_nan;
1056 }
1057 return packFloat32( zSign, 0xFF, 0 );
1058 }
1059 if ( bExp == 0xFF ) {
1060 if ( bSig ) return propagateFloat32NaN( a, b );
1061 return packFloat32( zSign, 0, 0 );
1062 }
1063 if ( bExp == 0 ) {
1064 if ( bSig == 0 ) {
1065 if ( ( aExp | aSig ) == 0 ) {
1066 float_raise( float_flag_invalid );
1067 return float32_default_nan;
1068 }
1069 float_raise( float_flag_divbyzero );
1070 return packFloat32( zSign, 0xFF, 0 );
1071 }
1072 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1073 }
1074 if ( aExp == 0 ) {
1075 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1076 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1077 }
1078 zExp = aExp - bExp + 0x7D;
1079 aSig = ( aSig | 0x00800000 )<<7;
1080 bSig = ( bSig | 0x00800000 )<<8;
1081 if ( bSig <= ( aSig + aSig ) ) {
1082 aSig >>= 1;
1083 ++zExp;
1084 }
1085 zSig = estimateDiv64To32( aSig, 0, bSig );
1086 if ( ( zSig & 0x3F ) <= 2 ) {
1087 mul32To64( bSig, zSig, &term0, &term1 );
1088 sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1089 while ( (sbits32) rem0 < 0 ) {
1090 --zSig;
1091 add64( rem0, rem1, 0, bSig, &rem0, &rem1 );
1092 }
1093 zSig |= ( rem1 != 0 );
1094 }
1095 return roundAndPackFloat32( zSign, zExp, zSig );
1096
1097 }
1098
1099 #ifndef SOFTFLOAT_FOR_GCC
1100 /*
1101 -------------------------------------------------------------------------------
1102 Returns the remainder of the single-precision floating-point value `a'
1103 with respect to the corresponding value `b'. The operation is performed
1104 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1105 -------------------------------------------------------------------------------
1106 */
float32_rem(float32 a,float32 b)1107 float32 float32_rem( float32 a, float32 b )
1108 {
1109 flag aSign, bSign, zSign;
1110 int16 aExp, bExp, expDiff;
1111 bits32 aSig, bSig, q, allZero, alternateASig;
1112 sbits32 sigMean;
1113
1114 aSig = extractFloat32Frac( a );
1115 aExp = extractFloat32Exp( a );
1116 aSign = extractFloat32Sign( a );
1117 bSig = extractFloat32Frac( b );
1118 bExp = extractFloat32Exp( b );
1119 bSign = extractFloat32Sign( b );
1120 if ( aExp == 0xFF ) {
1121 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1122 return propagateFloat32NaN( a, b );
1123 }
1124 float_raise( float_flag_invalid );
1125 return float32_default_nan;
1126 }
1127 if ( bExp == 0xFF ) {
1128 if ( bSig ) return propagateFloat32NaN( a, b );
1129 return a;
1130 }
1131 if ( bExp == 0 ) {
1132 if ( bSig == 0 ) {
1133 float_raise( float_flag_invalid );
1134 return float32_default_nan;
1135 }
1136 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1137 }
1138 if ( aExp == 0 ) {
1139 if ( aSig == 0 ) return a;
1140 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1141 }
1142 expDiff = aExp - bExp;
1143 aSig = ( aSig | 0x00800000 )<<8;
1144 bSig = ( bSig | 0x00800000 )<<8;
1145 if ( expDiff < 0 ) {
1146 if ( expDiff < -1 ) return a;
1147 aSig >>= 1;
1148 }
1149 q = ( bSig <= aSig );
1150 if ( q ) aSig -= bSig;
1151 expDiff -= 32;
1152 while ( 0 < expDiff ) {
1153 q = estimateDiv64To32( aSig, 0, bSig );
1154 q = ( 2 < q ) ? q - 2 : 0;
1155 aSig = - ( ( bSig>>2 ) * q );
1156 expDiff -= 30;
1157 }
1158 expDiff += 32;
1159 if ( 0 < expDiff ) {
1160 q = estimateDiv64To32( aSig, 0, bSig );
1161 q = ( 2 < q ) ? q - 2 : 0;
1162 q >>= 32 - expDiff;
1163 bSig >>= 2;
1164 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1165 }
1166 else {
1167 aSig >>= 2;
1168 bSig >>= 2;
1169 }
1170 do {
1171 alternateASig = aSig;
1172 ++q;
1173 aSig -= bSig;
1174 } while ( 0 <= (sbits32) aSig );
1175 sigMean = aSig + alternateASig;
1176 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1177 aSig = alternateASig;
1178 }
1179 zSign = ( (sbits32) aSig < 0 );
1180 if ( zSign ) aSig = - aSig;
1181 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1182
1183 }
1184 #endif
1185
1186 #ifndef SOFTFLOAT_FOR_GCC
1187 /*
1188 -------------------------------------------------------------------------------
1189 Returns the square root of the single-precision floating-point value `a'.
1190 The operation is performed according to the IEC/IEEE Standard for Binary
1191 Floating-Point Arithmetic.
1192 -------------------------------------------------------------------------------
1193 */
float32_sqrt(float32 a)1194 float32 float32_sqrt( float32 a )
1195 {
1196 flag aSign;
1197 int16 aExp, zExp;
1198 bits32 aSig, zSig, rem0, rem1, term0, term1;
1199
1200 aSig = extractFloat32Frac( a );
1201 aExp = extractFloat32Exp( a );
1202 aSign = extractFloat32Sign( a );
1203 if ( aExp == 0xFF ) {
1204 if ( aSig ) return propagateFloat32NaN( a, 0 );
1205 if ( ! aSign ) return a;
1206 float_raise( float_flag_invalid );
1207 return float32_default_nan;
1208 }
1209 if ( aSign ) {
1210 if ( ( aExp | aSig ) == 0 ) return a;
1211 float_raise( float_flag_invalid );
1212 return float32_default_nan;
1213 }
1214 if ( aExp == 0 ) {
1215 if ( aSig == 0 ) return 0;
1216 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1217 }
1218 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1219 aSig = ( aSig | 0x00800000 )<<8;
1220 zSig = estimateSqrt32( aExp, aSig ) + 2;
1221 if ( ( zSig & 0x7F ) <= 5 ) {
1222 if ( zSig < 2 ) {
1223 zSig = 0x7FFFFFFF;
1224 goto roundAndPack;
1225 }
1226 else {
1227 aSig >>= aExp & 1;
1228 mul32To64( zSig, zSig, &term0, &term1 );
1229 sub64( aSig, 0, term0, term1, &rem0, &rem1 );
1230 while ( (sbits32) rem0 < 0 ) {
1231 --zSig;
1232 shortShift64Left( 0, zSig, 1, &term0, &term1 );
1233 term1 |= 1;
1234 add64( rem0, rem1, term0, term1, &rem0, &rem1 );
1235 }
1236 zSig |= ( ( rem0 | rem1 ) != 0 );
1237 }
1238 }
1239 shift32RightJamming( zSig, 1, &zSig );
1240 roundAndPack:
1241 return roundAndPackFloat32( 0, zExp, zSig );
1242
1243 }
1244 #endif
1245
1246 /*
1247 -------------------------------------------------------------------------------
1248 Returns 1 if the single-precision floating-point value `a' is equal to
1249 the corresponding value `b', and 0 otherwise. The comparison is performed
1250 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1251 -------------------------------------------------------------------------------
1252 */
float32_eq(float32 a,float32 b)1253 flag float32_eq( float32 a, float32 b )
1254 {
1255
1256 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1257 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1258 ) {
1259 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1260 float_raise( float_flag_invalid );
1261 }
1262 return 0;
1263 }
1264 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1265
1266 }
1267
1268 /*
1269 -------------------------------------------------------------------------------
1270 Returns 1 if the single-precision floating-point value `a' is less than
1271 or equal to the corresponding value `b', and 0 otherwise. The comparison
1272 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1273 Arithmetic.
1274 -------------------------------------------------------------------------------
1275 */
float32_le(float32 a,float32 b)1276 flag float32_le( float32 a, float32 b )
1277 {
1278 flag aSign, bSign;
1279
1280 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1281 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1282 ) {
1283 float_raise( float_flag_invalid );
1284 return 0;
1285 }
1286 aSign = extractFloat32Sign( a );
1287 bSign = extractFloat32Sign( b );
1288 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1289 return ( a == b ) || ( aSign ^ ( a < b ) );
1290
1291 }
1292
1293 /*
1294 -------------------------------------------------------------------------------
1295 Returns 1 if the single-precision floating-point value `a' is less than
1296 the corresponding value `b', and 0 otherwise. The comparison is performed
1297 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1298 -------------------------------------------------------------------------------
1299 */
float32_lt(float32 a,float32 b)1300 flag float32_lt( float32 a, float32 b )
1301 {
1302 flag aSign, bSign;
1303
1304 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1305 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1306 ) {
1307 float_raise( float_flag_invalid );
1308 return 0;
1309 }
1310 aSign = extractFloat32Sign( a );
1311 bSign = extractFloat32Sign( b );
1312 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1313 return ( a != b ) && ( aSign ^ ( a < b ) );
1314
1315 }
1316
1317 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1318 /*
1319 -------------------------------------------------------------------------------
1320 Returns 1 if the single-precision floating-point value `a' is equal to
1321 the corresponding value `b', and 0 otherwise. The invalid exception is
1322 raised if either operand is a NaN. Otherwise, the comparison is performed
1323 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1324 -------------------------------------------------------------------------------
1325 */
float32_eq_signaling(float32 a,float32 b)1326 flag float32_eq_signaling( float32 a, float32 b )
1327 {
1328
1329 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1330 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1331 ) {
1332 float_raise( float_flag_invalid );
1333 return 0;
1334 }
1335 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
1336
1337 }
1338
1339 /*
1340 -------------------------------------------------------------------------------
1341 Returns 1 if the single-precision floating-point value `a' is less than or
1342 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1343 cause an exception. Otherwise, the comparison is performed according to the
1344 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1345 -------------------------------------------------------------------------------
1346 */
float32_le_quiet(float32 a,float32 b)1347 flag float32_le_quiet( float32 a, float32 b )
1348 {
1349 flag aSign, bSign;
1350 int16 aExp, bExp;
1351
1352 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1353 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1354 ) {
1355 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1356 float_raise( float_flag_invalid );
1357 }
1358 return 0;
1359 }
1360 aSign = extractFloat32Sign( a );
1361 bSign = extractFloat32Sign( b );
1362 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
1363 return ( a == b ) || ( aSign ^ ( a < b ) );
1364
1365 }
1366
1367 /*
1368 -------------------------------------------------------------------------------
1369 Returns 1 if the single-precision floating-point value `a' is less than
1370 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1371 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1372 Standard for Binary Floating-Point Arithmetic.
1373 -------------------------------------------------------------------------------
1374 */
float32_lt_quiet(float32 a,float32 b)1375 flag float32_lt_quiet( float32 a, float32 b )
1376 {
1377 flag aSign, bSign;
1378
1379 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
1380 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
1381 ) {
1382 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
1383 float_raise( float_flag_invalid );
1384 }
1385 return 0;
1386 }
1387 aSign = extractFloat32Sign( a );
1388 bSign = extractFloat32Sign( b );
1389 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
1390 return ( a != b ) && ( aSign ^ ( a < b ) );
1391
1392 }
1393 #endif /* !SOFTFLOAT_FOR_GCC */
1394
1395 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1396 /*
1397 -------------------------------------------------------------------------------
1398 Returns the result of converting the double-precision floating-point value
1399 `a' to the 32-bit two's complement integer format. The conversion is
1400 performed according to the IEC/IEEE Standard for Binary Floating-Point
1401 Arithmetic---which means in particular that the conversion is rounded
1402 according to the current rounding mode. If `a' is a NaN, the largest
1403 positive integer is returned. Otherwise, if the conversion overflows, the
1404 largest integer with the same sign as `a' is returned.
1405 -------------------------------------------------------------------------------
1406 */
float64_to_int32(float64 a)1407 int32 float64_to_int32( float64 a )
1408 {
1409 flag aSign;
1410 int16 aExp, shiftCount;
1411 bits32 aSig0, aSig1, absZ, aSigExtra;
1412 int32 z;
1413 int8 roundingMode;
1414
1415 aSig1 = extractFloat64Frac1( a );
1416 aSig0 = extractFloat64Frac0( a );
1417 aExp = extractFloat64Exp( a );
1418 aSign = extractFloat64Sign( a );
1419 shiftCount = aExp - 0x413;
1420 if ( 0 <= shiftCount ) {
1421 if ( 0x41E < aExp ) {
1422 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1423 goto invalid;
1424 }
1425 shortShift64Left(
1426 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1427 if ( 0x80000000 < absZ ) goto invalid;
1428 }
1429 else {
1430 aSig1 = ( aSig1 != 0 );
1431 if ( aExp < 0x3FE ) {
1432 aSigExtra = aExp | aSig0 | aSig1;
1433 absZ = 0;
1434 }
1435 else {
1436 aSig0 |= 0x00100000;
1437 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1438 absZ = aSig0>>( - shiftCount );
1439 }
1440 }
1441 roundingMode = float_rounding_mode;
1442 if ( roundingMode == float_round_nearest_even ) {
1443 if ( (sbits32) aSigExtra < 0 ) {
1444 ++absZ;
1445 if ( (bits32) ( aSigExtra<<1 ) == 0 ) absZ &= ~1;
1446 }
1447 z = aSign ? - absZ : absZ;
1448 }
1449 else {
1450 aSigExtra = ( aSigExtra != 0 );
1451 if ( aSign ) {
1452 z = - ( absZ
1453 + ( ( roundingMode == float_round_down ) & aSigExtra ) );
1454 }
1455 else {
1456 z = absZ + ( ( roundingMode == float_round_up ) & aSigExtra );
1457 }
1458 }
1459 if ( ( aSign ^ ( z < 0 ) ) && z ) {
1460 invalid:
1461 float_raise( float_flag_invalid );
1462 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1463 }
1464 if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
1465 return z;
1466
1467 }
1468 #endif /* !SOFTFLOAT_FOR_GCC */
1469
1470 /*
1471 -------------------------------------------------------------------------------
1472 Returns the result of converting the double-precision floating-point value
1473 `a' to the 32-bit two's complement integer format. The conversion is
1474 performed according to the IEC/IEEE Standard for Binary Floating-Point
1475 Arithmetic, except that the conversion is always rounded toward zero.
1476 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1477 the conversion overflows, the largest integer with the same sign as `a' is
1478 returned.
1479 -------------------------------------------------------------------------------
1480 */
float64_to_int32_round_to_zero(float64 a)1481 int32 float64_to_int32_round_to_zero( float64 a )
1482 {
1483 flag aSign;
1484 int16 aExp, shiftCount;
1485 bits32 aSig0, aSig1, absZ, aSigExtra;
1486 int32 z;
1487
1488 aSig1 = extractFloat64Frac1( a );
1489 aSig0 = extractFloat64Frac0( a );
1490 aExp = extractFloat64Exp( a );
1491 aSign = extractFloat64Sign( a );
1492 shiftCount = aExp - 0x413;
1493 if ( 0 <= shiftCount ) {
1494 if ( 0x41E < aExp ) {
1495 if ( ( aExp == 0x7FF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1496 goto invalid;
1497 }
1498 shortShift64Left(
1499 aSig0 | 0x00100000, aSig1, shiftCount, &absZ, &aSigExtra );
1500 }
1501 else {
1502 if ( aExp < 0x3FF ) {
1503 if ( aExp | aSig0 | aSig1 ) {
1504 float_exception_flags |= float_flag_inexact;
1505 }
1506 return 0;
1507 }
1508 aSig0 |= 0x00100000;
1509 aSigExtra = ( aSig0<<( shiftCount & 31 ) ) | aSig1;
1510 absZ = aSig0>>( - shiftCount );
1511 }
1512 z = aSign ? - absZ : absZ;
1513 if ( ( aSign ^ ( z < 0 ) ) && z ) {
1514 invalid:
1515 float_raise( float_flag_invalid );
1516 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1517 }
1518 if ( aSigExtra ) float_exception_flags |= float_flag_inexact;
1519 return z;
1520
1521 }
1522
1523 /*
1524 -------------------------------------------------------------------------------
1525 Returns the result of converting the double-precision floating-point value
1526 `a' to the single-precision floating-point format. The conversion is
1527 performed according to the IEC/IEEE Standard for Binary Floating-Point
1528 Arithmetic.
1529 -------------------------------------------------------------------------------
1530 */
float64_to_float32(float64 a)1531 float32 float64_to_float32( float64 a )
1532 {
1533 flag aSign;
1534 int16 aExp;
1535 bits32 aSig0, aSig1, zSig;
1536 bits32 allZero;
1537
1538 aSig1 = extractFloat64Frac1( a );
1539 aSig0 = extractFloat64Frac0( a );
1540 aExp = extractFloat64Exp( a );
1541 aSign = extractFloat64Sign( a );
1542 if ( aExp == 0x7FF ) {
1543 if ( aSig0 | aSig1 ) {
1544 return commonNaNToFloat32( float64ToCommonNaN( a ) );
1545 }
1546 return packFloat32( aSign, 0xFF, 0 );
1547 }
1548 shift64RightJamming( aSig0, aSig1, 22, &allZero, &zSig );
1549 if ( aExp ) zSig |= 0x40000000;
1550 return roundAndPackFloat32( aSign, aExp - 0x381, zSig );
1551
1552 }
1553
1554 #ifndef SOFTFLOAT_FOR_GCC
1555 /*
1556 -------------------------------------------------------------------------------
1557 Rounds the double-precision floating-point value `a' to an integer,
1558 and returns the result as a double-precision floating-point value. The
1559 operation is performed according to the IEC/IEEE Standard for Binary
1560 Floating-Point Arithmetic.
1561 -------------------------------------------------------------------------------
1562 */
float64_round_to_int(float64 a)1563 float64 float64_round_to_int( float64 a )
1564 {
1565 flag aSign;
1566 int16 aExp;
1567 bits32 lastBitMask, roundBitsMask;
1568 int8 roundingMode;
1569 float64 z;
1570
1571 aExp = extractFloat64Exp( a );
1572 if ( 0x413 <= aExp ) {
1573 if ( 0x433 <= aExp ) {
1574 if ( ( aExp == 0x7FF )
1575 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) ) {
1576 return propagateFloat64NaN( a, a );
1577 }
1578 return a;
1579 }
1580 lastBitMask = 1;
1581 lastBitMask = ( lastBitMask<<( 0x432 - aExp ) )<<1;
1582 roundBitsMask = lastBitMask - 1;
1583 z = a;
1584 roundingMode = float_rounding_mode;
1585 if ( roundingMode == float_round_nearest_even ) {
1586 if ( lastBitMask ) {
1587 add64( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
1588 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
1589 }
1590 else {
1591 if ( (sbits32) z.low < 0 ) {
1592 ++z.high;
1593 if ( (bits32) ( z.low<<1 ) == 0 ) z.high &= ~1;
1594 }
1595 }
1596 }
1597 else if ( roundingMode != float_round_to_zero ) {
1598 if ( extractFloat64Sign( z )
1599 ^ ( roundingMode == float_round_up ) ) {
1600 add64( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
1601 }
1602 }
1603 z.low &= ~ roundBitsMask;
1604 }
1605 else {
1606 if ( aExp <= 0x3FE ) {
1607 if ( ( ( (bits32) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
1608 float_exception_flags |= float_flag_inexact;
1609 aSign = extractFloat64Sign( a );
1610 switch ( float_rounding_mode ) {
1611 case float_round_nearest_even:
1612 if ( ( aExp == 0x3FE )
1613 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) )
1614 ) {
1615 return packFloat64( aSign, 0x3FF, 0, 0 );
1616 }
1617 break;
1618 case float_round_down:
1619 return
1620 aSign ? packFloat64( 1, 0x3FF, 0, 0 )
1621 : packFloat64( 0, 0, 0, 0 );
1622 case float_round_up:
1623 return
1624 aSign ? packFloat64( 1, 0, 0, 0 )
1625 : packFloat64( 0, 0x3FF, 0, 0 );
1626 }
1627 return packFloat64( aSign, 0, 0, 0 );
1628 }
1629 lastBitMask = 1;
1630 lastBitMask <<= 0x413 - aExp;
1631 roundBitsMask = lastBitMask - 1;
1632 z.low = 0;
1633 z.high = a.high;
1634 roundingMode = float_rounding_mode;
1635 if ( roundingMode == float_round_nearest_even ) {
1636 z.high += lastBitMask>>1;
1637 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
1638 z.high &= ~ lastBitMask;
1639 }
1640 }
1641 else if ( roundingMode != float_round_to_zero ) {
1642 if ( extractFloat64Sign( z )
1643 ^ ( roundingMode == float_round_up ) ) {
1644 z.high |= ( a.low != 0 );
1645 z.high += roundBitsMask;
1646 }
1647 }
1648 z.high &= ~ roundBitsMask;
1649 }
1650 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
1651 float_exception_flags |= float_flag_inexact;
1652 }
1653 return z;
1654
1655 }
1656 #endif
1657
1658 /*
1659 -------------------------------------------------------------------------------
1660 Returns the result of adding the absolute values of the double-precision
1661 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1662 before being returned. `zSign' is ignored if the result is a NaN.
1663 The addition is performed according to the IEC/IEEE Standard for Binary
1664 Floating-Point Arithmetic.
1665 -------------------------------------------------------------------------------
1666 */
addFloat64Sigs(float64 a,float64 b,flag zSign)1667 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
1668 {
1669 int16 aExp, bExp, zExp;
1670 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1671 int16 expDiff;
1672
1673 aSig1 = extractFloat64Frac1( a );
1674 aSig0 = extractFloat64Frac0( a );
1675 aExp = extractFloat64Exp( a );
1676 bSig1 = extractFloat64Frac1( b );
1677 bSig0 = extractFloat64Frac0( b );
1678 bExp = extractFloat64Exp( b );
1679 expDiff = aExp - bExp;
1680 if ( 0 < expDiff ) {
1681 if ( aExp == 0x7FF ) {
1682 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1683 return a;
1684 }
1685 if ( bExp == 0 ) {
1686 --expDiff;
1687 }
1688 else {
1689 bSig0 |= 0x00100000;
1690 }
1691 shift64ExtraRightJamming(
1692 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
1693 zExp = aExp;
1694 }
1695 else if ( expDiff < 0 ) {
1696 if ( bExp == 0x7FF ) {
1697 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1698 return packFloat64( zSign, 0x7FF, 0, 0 );
1699 }
1700 if ( aExp == 0 ) {
1701 ++expDiff;
1702 }
1703 else {
1704 aSig0 |= 0x00100000;
1705 }
1706 shift64ExtraRightJamming(
1707 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
1708 zExp = bExp;
1709 }
1710 else {
1711 if ( aExp == 0x7FF ) {
1712 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1713 return propagateFloat64NaN( a, b );
1714 }
1715 return a;
1716 }
1717 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1718 if ( aExp == 0 ) return packFloat64( zSign, 0, zSig0, zSig1 );
1719 zSig2 = 0;
1720 zSig0 |= 0x00200000;
1721 zExp = aExp;
1722 goto shiftRight1;
1723 }
1724 aSig0 |= 0x00100000;
1725 add64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1726 --zExp;
1727 if ( zSig0 < 0x00200000 ) goto roundAndPack;
1728 ++zExp;
1729 shiftRight1:
1730 shift64ExtraRightJamming( zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1731 roundAndPack:
1732 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1733
1734 }
1735
1736 /*
1737 -------------------------------------------------------------------------------
1738 Returns the result of subtracting the absolute values of the double-
1739 precision floating-point values `a' and `b'. If `zSign' is 1, the
1740 difference is negated before being returned. `zSign' is ignored if the
1741 result is a NaN. The subtraction is performed according to the IEC/IEEE
1742 Standard for Binary Floating-Point Arithmetic.
1743 -------------------------------------------------------------------------------
1744 */
subFloat64Sigs(float64 a,float64 b,flag zSign)1745 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
1746 {
1747 int16 aExp, bExp, zExp;
1748 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
1749 int16 expDiff;
1750
1751 aSig1 = extractFloat64Frac1( a );
1752 aSig0 = extractFloat64Frac0( a );
1753 aExp = extractFloat64Exp( a );
1754 bSig1 = extractFloat64Frac1( b );
1755 bSig0 = extractFloat64Frac0( b );
1756 bExp = extractFloat64Exp( b );
1757 expDiff = aExp - bExp;
1758 shortShift64Left( aSig0, aSig1, 10, &aSig0, &aSig1 );
1759 shortShift64Left( bSig0, bSig1, 10, &bSig0, &bSig1 );
1760 if ( 0 < expDiff ) goto aExpBigger;
1761 if ( expDiff < 0 ) goto bExpBigger;
1762 if ( aExp == 0x7FF ) {
1763 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
1764 return propagateFloat64NaN( a, b );
1765 }
1766 float_raise( float_flag_invalid );
1767 return float64_default_nan;
1768 }
1769 if ( aExp == 0 ) {
1770 aExp = 1;
1771 bExp = 1;
1772 }
1773 if ( bSig0 < aSig0 ) goto aBigger;
1774 if ( aSig0 < bSig0 ) goto bBigger;
1775 if ( bSig1 < aSig1 ) goto aBigger;
1776 if ( aSig1 < bSig1 ) goto bBigger;
1777 return packFloat64( float_rounding_mode == float_round_down, 0, 0, 0 );
1778 bExpBigger:
1779 if ( bExp == 0x7FF ) {
1780 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1781 return packFloat64( zSign ^ 1, 0x7FF, 0, 0 );
1782 }
1783 if ( aExp == 0 ) {
1784 ++expDiff;
1785 }
1786 else {
1787 aSig0 |= 0x40000000;
1788 }
1789 shift64RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
1790 bSig0 |= 0x40000000;
1791 bBigger:
1792 sub64( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
1793 zExp = bExp;
1794 zSign ^= 1;
1795 goto normalizeRoundAndPack;
1796 aExpBigger:
1797 if ( aExp == 0x7FF ) {
1798 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1799 return a;
1800 }
1801 if ( bExp == 0 ) {
1802 --expDiff;
1803 }
1804 else {
1805 bSig0 |= 0x40000000;
1806 }
1807 shift64RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
1808 aSig0 |= 0x40000000;
1809 aBigger:
1810 sub64( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
1811 zExp = aExp;
1812 normalizeRoundAndPack:
1813 --zExp;
1814 return normalizeRoundAndPackFloat64( zSign, zExp - 10, zSig0, zSig1 );
1815
1816 }
1817
1818 /*
1819 -------------------------------------------------------------------------------
1820 Returns the result of adding the double-precision floating-point values `a'
1821 and `b'. The operation is performed according to the IEC/IEEE Standard for
1822 Binary Floating-Point Arithmetic.
1823 -------------------------------------------------------------------------------
1824 */
float64_add(float64 a,float64 b)1825 float64 float64_add( float64 a, float64 b )
1826 {
1827 flag aSign, bSign;
1828
1829 aSign = extractFloat64Sign( a );
1830 bSign = extractFloat64Sign( b );
1831 if ( aSign == bSign ) {
1832 return addFloat64Sigs( a, b, aSign );
1833 }
1834 else {
1835 return subFloat64Sigs( a, b, aSign );
1836 }
1837
1838 }
1839
1840 /*
1841 -------------------------------------------------------------------------------
1842 Returns the result of subtracting the double-precision floating-point values
1843 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1844 for Binary Floating-Point Arithmetic.
1845 -------------------------------------------------------------------------------
1846 */
float64_sub(float64 a,float64 b)1847 float64 float64_sub( float64 a, float64 b )
1848 {
1849 flag aSign, bSign;
1850
1851 aSign = extractFloat64Sign( a );
1852 bSign = extractFloat64Sign( b );
1853 if ( aSign == bSign ) {
1854 return subFloat64Sigs( a, b, aSign );
1855 }
1856 else {
1857 return addFloat64Sigs( a, b, aSign );
1858 }
1859
1860 }
1861
1862 /*
1863 -------------------------------------------------------------------------------
1864 Returns the result of multiplying the double-precision floating-point values
1865 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1866 for Binary Floating-Point Arithmetic.
1867 -------------------------------------------------------------------------------
1868 */
float64_mul(float64 a,float64 b)1869 float64 float64_mul( float64 a, float64 b )
1870 {
1871 flag aSign, bSign, zSign;
1872 int16 aExp, bExp, zExp;
1873 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
1874
1875 aSig1 = extractFloat64Frac1( a );
1876 aSig0 = extractFloat64Frac0( a );
1877 aExp = extractFloat64Exp( a );
1878 aSign = extractFloat64Sign( a );
1879 bSig1 = extractFloat64Frac1( b );
1880 bSig0 = extractFloat64Frac0( b );
1881 bExp = extractFloat64Exp( b );
1882 bSign = extractFloat64Sign( b );
1883 zSign = aSign ^ bSign;
1884 if ( aExp == 0x7FF ) {
1885 if ( ( aSig0 | aSig1 )
1886 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
1887 return propagateFloat64NaN( a, b );
1888 }
1889 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
1890 return packFloat64( zSign, 0x7FF, 0, 0 );
1891 }
1892 if ( bExp == 0x7FF ) {
1893 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1894 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
1895 invalid:
1896 float_raise( float_flag_invalid );
1897 return float64_default_nan;
1898 }
1899 return packFloat64( zSign, 0x7FF, 0, 0 );
1900 }
1901 if ( aExp == 0 ) {
1902 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1903 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
1904 }
1905 if ( bExp == 0 ) {
1906 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1907 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
1908 }
1909 zExp = aExp + bExp - 0x400;
1910 aSig0 |= 0x00100000;
1911 shortShift64Left( bSig0, bSig1, 12, &bSig0, &bSig1 );
1912 mul64To128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
1913 add64( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
1914 zSig2 |= ( zSig3 != 0 );
1915 if ( 0x00200000 <= zSig0 ) {
1916 shift64ExtraRightJamming(
1917 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
1918 ++zExp;
1919 }
1920 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
1921
1922 }
1923
1924 /*
1925 -------------------------------------------------------------------------------
1926 Returns the result of dividing the double-precision floating-point value `a'
1927 by the corresponding value `b'. The operation is performed according to the
1928 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1929 -------------------------------------------------------------------------------
1930 */
float64_div(float64 a,float64 b)1931 float64 float64_div( float64 a, float64 b )
1932 {
1933 flag aSign, bSign, zSign;
1934 int16 aExp, bExp, zExp;
1935 bits32 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
1936 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
1937
1938 aSig1 = extractFloat64Frac1( a );
1939 aSig0 = extractFloat64Frac0( a );
1940 aExp = extractFloat64Exp( a );
1941 aSign = extractFloat64Sign( a );
1942 bSig1 = extractFloat64Frac1( b );
1943 bSig0 = extractFloat64Frac0( b );
1944 bExp = extractFloat64Exp( b );
1945 bSign = extractFloat64Sign( b );
1946 zSign = aSign ^ bSign;
1947 if ( aExp == 0x7FF ) {
1948 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, b );
1949 if ( bExp == 0x7FF ) {
1950 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1951 goto invalid;
1952 }
1953 return packFloat64( zSign, 0x7FF, 0, 0 );
1954 }
1955 if ( bExp == 0x7FF ) {
1956 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
1957 return packFloat64( zSign, 0, 0, 0 );
1958 }
1959 if ( bExp == 0 ) {
1960 if ( ( bSig0 | bSig1 ) == 0 ) {
1961 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
1962 invalid:
1963 float_raise( float_flag_invalid );
1964 return float64_default_nan;
1965 }
1966 float_raise( float_flag_divbyzero );
1967 return packFloat64( zSign, 0x7FF, 0, 0 );
1968 }
1969 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
1970 }
1971 if ( aExp == 0 ) {
1972 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( zSign, 0, 0, 0 );
1973 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
1974 }
1975 zExp = aExp - bExp + 0x3FD;
1976 shortShift64Left( aSig0 | 0x00100000, aSig1, 11, &aSig0, &aSig1 );
1977 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
1978 if ( le64( bSig0, bSig1, aSig0, aSig1 ) ) {
1979 shift64Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
1980 ++zExp;
1981 }
1982 zSig0 = estimateDiv64To32( aSig0, aSig1, bSig0 );
1983 mul64By32To96( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
1984 sub96( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
1985 while ( (sbits32) rem0 < 0 ) {
1986 --zSig0;
1987 add96( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
1988 }
1989 zSig1 = estimateDiv64To32( rem1, rem2, bSig0 );
1990 if ( ( zSig1 & 0x3FF ) <= 4 ) {
1991 mul64By32To96( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
1992 sub96( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
1993 while ( (sbits32) rem1 < 0 ) {
1994 --zSig1;
1995 add96( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
1996 }
1997 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
1998 }
1999 shift64ExtraRightJamming( zSig0, zSig1, 0, 11, &zSig0, &zSig1, &zSig2 );
2000 return roundAndPackFloat64( zSign, zExp, zSig0, zSig1, zSig2 );
2001
2002 }
2003
2004 #ifndef SOFTFLOAT_FOR_GCC
2005 /*
2006 -------------------------------------------------------------------------------
2007 Returns the remainder of the double-precision floating-point value `a'
2008 with respect to the corresponding value `b'. The operation is performed
2009 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2010 -------------------------------------------------------------------------------
2011 */
float64_rem(float64 a,float64 b)2012 float64 float64_rem( float64 a, float64 b )
2013 {
2014 flag aSign, bSign, zSign;
2015 int16 aExp, bExp, expDiff;
2016 bits32 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
2017 bits32 allZero, alternateASig0, alternateASig1, sigMean1;
2018 sbits32 sigMean0;
2019 float64 z;
2020
2021 aSig1 = extractFloat64Frac1( a );
2022 aSig0 = extractFloat64Frac0( a );
2023 aExp = extractFloat64Exp( a );
2024 aSign = extractFloat64Sign( a );
2025 bSig1 = extractFloat64Frac1( b );
2026 bSig0 = extractFloat64Frac0( b );
2027 bExp = extractFloat64Exp( b );
2028 bSign = extractFloat64Sign( b );
2029 if ( aExp == 0x7FF ) {
2030 if ( ( aSig0 | aSig1 )
2031 || ( ( bExp == 0x7FF ) && ( bSig0 | bSig1 ) ) ) {
2032 return propagateFloat64NaN( a, b );
2033 }
2034 goto invalid;
2035 }
2036 if ( bExp == 0x7FF ) {
2037 if ( bSig0 | bSig1 ) return propagateFloat64NaN( a, b );
2038 return a;
2039 }
2040 if ( bExp == 0 ) {
2041 if ( ( bSig0 | bSig1 ) == 0 ) {
2042 invalid:
2043 float_raise( float_flag_invalid );
2044 return float64_default_nan;
2045 }
2046 normalizeFloat64Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
2047 }
2048 if ( aExp == 0 ) {
2049 if ( ( aSig0 | aSig1 ) == 0 ) return a;
2050 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2051 }
2052 expDiff = aExp - bExp;
2053 if ( expDiff < -1 ) return a;
2054 shortShift64Left(
2055 aSig0 | 0x00100000, aSig1, 11 - ( expDiff < 0 ), &aSig0, &aSig1 );
2056 shortShift64Left( bSig0 | 0x00100000, bSig1, 11, &bSig0, &bSig1 );
2057 q = le64( bSig0, bSig1, aSig0, aSig1 );
2058 if ( q ) sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2059 expDiff -= 32;
2060 while ( 0 < expDiff ) {
2061 q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2062 q = ( 4 < q ) ? q - 4 : 0;
2063 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2064 shortShift96Left( term0, term1, term2, 29, &term1, &term2, &allZero );
2065 shortShift64Left( aSig0, aSig1, 29, &aSig0, &allZero );
2066 sub64( aSig0, 0, term1, term2, &aSig0, &aSig1 );
2067 expDiff -= 29;
2068 }
2069 if ( -32 < expDiff ) {
2070 q = estimateDiv64To32( aSig0, aSig1, bSig0 );
2071 q = ( 4 < q ) ? q - 4 : 0;
2072 q >>= - expDiff;
2073 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2074 expDiff += 24;
2075 if ( expDiff < 0 ) {
2076 shift64Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
2077 }
2078 else {
2079 shortShift64Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
2080 }
2081 mul64By32To96( bSig0, bSig1, q, &term0, &term1, &term2 );
2082 sub64( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
2083 }
2084 else {
2085 shift64Right( aSig0, aSig1, 8, &aSig0, &aSig1 );
2086 shift64Right( bSig0, bSig1, 8, &bSig0, &bSig1 );
2087 }
2088 do {
2089 alternateASig0 = aSig0;
2090 alternateASig1 = aSig1;
2091 ++q;
2092 sub64( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2093 } while ( 0 <= (sbits32) aSig0 );
2094 add64(
2095 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
2096 if ( ( sigMean0 < 0 )
2097 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
2098 aSig0 = alternateASig0;
2099 aSig1 = alternateASig1;
2100 }
2101 zSign = ( (sbits32) aSig0 < 0 );
2102 if ( zSign ) sub64( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
2103 return
2104 normalizeRoundAndPackFloat64( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
2105
2106 }
2107 #endif
2108
2109 #ifndef SOFTFLOAT_FOR_GCC
2110 /*
2111 -------------------------------------------------------------------------------
2112 Returns the square root of the double-precision floating-point value `a'.
2113 The operation is performed according to the IEC/IEEE Standard for Binary
2114 Floating-Point Arithmetic.
2115 -------------------------------------------------------------------------------
2116 */
float64_sqrt(float64 a)2117 float64 float64_sqrt( float64 a )
2118 {
2119 flag aSign;
2120 int16 aExp, zExp;
2121 bits32 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
2122 bits32 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
2123 float64 z;
2124
2125 aSig1 = extractFloat64Frac1( a );
2126 aSig0 = extractFloat64Frac0( a );
2127 aExp = extractFloat64Exp( a );
2128 aSign = extractFloat64Sign( a );
2129 if ( aExp == 0x7FF ) {
2130 if ( aSig0 | aSig1 ) return propagateFloat64NaN( a, a );
2131 if ( ! aSign ) return a;
2132 goto invalid;
2133 }
2134 if ( aSign ) {
2135 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
2136 invalid:
2137 float_raise( float_flag_invalid );
2138 return float64_default_nan;
2139 }
2140 if ( aExp == 0 ) {
2141 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat64( 0, 0, 0, 0 );
2142 normalizeFloat64Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2143 }
2144 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2145 aSig0 |= 0x00100000;
2146 shortShift64Left( aSig0, aSig1, 11, &term0, &term1 );
2147 zSig0 = ( estimateSqrt32( aExp, term0 )>>1 ) + 1;
2148 if ( zSig0 == 0 ) zSig0 = 0x7FFFFFFF;
2149 doubleZSig0 = zSig0 + zSig0;
2150 shortShift64Left( aSig0, aSig1, 9 - ( aExp & 1 ), &aSig0, &aSig1 );
2151 mul32To64( zSig0, zSig0, &term0, &term1 );
2152 sub64( aSig0, aSig1, term0, term1, &rem0, &rem1 );
2153 while ( (sbits32) rem0 < 0 ) {
2154 --zSig0;
2155 doubleZSig0 -= 2;
2156 add64( rem0, rem1, 0, doubleZSig0 | 1, &rem0, &rem1 );
2157 }
2158 zSig1 = estimateDiv64To32( rem1, 0, doubleZSig0 );
2159 if ( ( zSig1 & 0x1FF ) <= 5 ) {
2160 if ( zSig1 == 0 ) zSig1 = 1;
2161 mul32To64( doubleZSig0, zSig1, &term1, &term2 );
2162 sub64( rem1, 0, term1, term2, &rem1, &rem2 );
2163 mul32To64( zSig1, zSig1, &term2, &term3 );
2164 sub96( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
2165 while ( (sbits32) rem1 < 0 ) {
2166 --zSig1;
2167 shortShift64Left( 0, zSig1, 1, &term2, &term3 );
2168 term3 |= 1;
2169 term2 |= doubleZSig0;
2170 add96( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
2171 }
2172 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
2173 }
2174 shift64ExtraRightJamming( zSig0, zSig1, 0, 10, &zSig0, &zSig1, &zSig2 );
2175 return roundAndPackFloat64( 0, zExp, zSig0, zSig1, zSig2 );
2176
2177 }
2178 #endif
2179
2180 /*
2181 -------------------------------------------------------------------------------
2182 Returns 1 if the double-precision floating-point value `a' is equal to
2183 the corresponding value `b', and 0 otherwise. The comparison is performed
2184 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2185 -------------------------------------------------------------------------------
2186 */
float64_eq(float64 a,float64 b)2187 flag float64_eq( float64 a, float64 b )
2188 {
2189
2190 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2191 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2192 || ( ( extractFloat64Exp( b ) == 0x7FF )
2193 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2194 ) {
2195 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2196 float_raise( float_flag_invalid );
2197 }
2198 return 0;
2199 }
2200 return ( a == b ) ||
2201 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
2202
2203 }
2204
2205 /*
2206 -------------------------------------------------------------------------------
2207 Returns 1 if the double-precision floating-point value `a' is less than
2208 or equal to the corresponding value `b', and 0 otherwise. The comparison
2209 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2210 Arithmetic.
2211 -------------------------------------------------------------------------------
2212 */
float64_le(float64 a,float64 b)2213 flag float64_le( float64 a, float64 b )
2214 {
2215 flag aSign, bSign;
2216
2217 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2218 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2219 || ( ( extractFloat64Exp( b ) == 0x7FF )
2220 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2221 ) {
2222 float_raise( float_flag_invalid );
2223 return 0;
2224 }
2225 aSign = extractFloat64Sign( a );
2226 bSign = extractFloat64Sign( b );
2227 if ( aSign != bSign )
2228 return aSign ||
2229 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
2230 0 );
2231 return ( a == b ) ||
2232 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
2233 }
2234
2235 /*
2236 -------------------------------------------------------------------------------
2237 Returns 1 if the double-precision floating-point value `a' is less than
2238 the corresponding value `b', and 0 otherwise. The comparison is performed
2239 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2240 -------------------------------------------------------------------------------
2241 */
float64_lt(float64 a,float64 b)2242 flag float64_lt( float64 a, float64 b )
2243 {
2244 flag aSign, bSign;
2245
2246 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2247 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2248 || ( ( extractFloat64Exp( b ) == 0x7FF )
2249 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2250 ) {
2251 float_raise( float_flag_invalid );
2252 return 0;
2253 }
2254 aSign = extractFloat64Sign( a );
2255 bSign = extractFloat64Sign( b );
2256 if ( aSign != bSign )
2257 return aSign &&
2258 ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
2259 0 );
2260 return ( a != b ) &&
2261 ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
2262
2263 }
2264
2265 #ifndef SOFTFLOAT_FOR_GCC
2266 /*
2267 -------------------------------------------------------------------------------
2268 Returns 1 if the double-precision floating-point value `a' is equal to
2269 the corresponding value `b', and 0 otherwise. The invalid exception is
2270 raised if either operand is a NaN. Otherwise, the comparison is performed
2271 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2272 -------------------------------------------------------------------------------
2273 */
float64_eq_signaling(float64 a,float64 b)2274 flag float64_eq_signaling( float64 a, float64 b )
2275 {
2276
2277 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2278 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2279 || ( ( extractFloat64Exp( b ) == 0x7FF )
2280 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2281 ) {
2282 float_raise( float_flag_invalid );
2283 return 0;
2284 }
2285 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2286
2287 }
2288
2289 /*
2290 -------------------------------------------------------------------------------
2291 Returns 1 if the double-precision floating-point value `a' is less than or
2292 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2293 cause an exception. Otherwise, the comparison is performed according to the
2294 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2295 -------------------------------------------------------------------------------
2296 */
float64_le_quiet(float64 a,float64 b)2297 flag float64_le_quiet( float64 a, float64 b )
2298 {
2299 flag aSign, bSign;
2300
2301 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2302 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2303 || ( ( extractFloat64Exp( b ) == 0x7FF )
2304 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2305 ) {
2306 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2307 float_raise( float_flag_invalid );
2308 }
2309 return 0;
2310 }
2311 aSign = extractFloat64Sign( a );
2312 bSign = extractFloat64Sign( b );
2313 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2314 return ( a == b ) || ( aSign ^ ( a < b ) );
2315
2316 }
2317
2318 /*
2319 -------------------------------------------------------------------------------
2320 Returns 1 if the double-precision floating-point value `a' is less than
2321 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2322 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2323 Standard for Binary Floating-Point Arithmetic.
2324 -------------------------------------------------------------------------------
2325 */
float64_lt_quiet(float64 a,float64 b)2326 flag float64_lt_quiet( float64 a, float64 b )
2327 {
2328 flag aSign, bSign;
2329
2330 if ( ( ( extractFloat64Exp( a ) == 0x7FF )
2331 && ( extractFloat64Frac0( a ) | extractFloat64Frac1( a ) ) )
2332 || ( ( extractFloat64Exp( b ) == 0x7FF )
2333 && ( extractFloat64Frac0( b ) | extractFloat64Frac1( b ) ) )
2334 ) {
2335 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2336 float_raise( float_flag_invalid );
2337 }
2338 return 0;
2339 }
2340 aSign = extractFloat64Sign( a );
2341 bSign = extractFloat64Sign( b );
2342 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2343 return ( a != b ) && ( aSign ^ ( a < b ) );
2344
2345 }
2346
2347 #endif
2348