webia1/dec64.nasm

## dec64.nasm
; title   dec64.nasm for x64.

; dec64.com
; 2022-09-03
; Public Domain

; No warranty expressed or implied. Use at your own risk. You have been warned.

; This file implements the elementary arithmetic operations for DEC64, a decimal
; floating point type. DEC64 uses 64 bits to represent a number. The low order
; 8 bits contain an exponent that ranges from -127 to 127. The exponent is not
; biased. The exponent -128 is reserved for nan (not a number). The remaining
; 56 bits, including the sign bit, are the coefficient in the range
; -36028797018963968 thru 36028797018963967. The exponent and the coefficient
; are both two's complement signed numbers.
;
; The value of any non-nan DEC64 number is coefficient * (10 ** exponent).
;
; Rounding is to the nearest value. Ties are rounded away from zero. Integer
; division is floored. The result of modulo has the sign of the divisor.
; There is no negative zero.
;
; These operations will produce a result of nan:
;
;   dec64_abs(nan)
;   dec64_ceiling(nan)
;   dec64_floor(nan)
;   dec64_neg(nan)
;   dec64_normal(nan)
;   dec64_signum(nan)
;
; These operations will produce a result of zero for all values of n,
; even if n is nan:
;
;   dec64_divide(0, n)
;   dec64_integer_divide(0, n)
;   dec64_modulo(0, n)
;   dec64_multiply(0, n)
;   dec64_multiply(n, 0)
;
; These operations will produce a result of nan for all values of n
; except zero:
;
;   dec64_divide(n, 0)
;   dec64_divide(n, nan)
;   dec64_integer_divide(n, 0)
;   dec64_integer_divide(n, nan)
;   dec64_modulo(n, 0)
;   dec64_modulo(n, nan)
;   dec64_multiply(n, nan)
;   dec64_multiply(nan, n)
;
; These operations will produce a result of nan for all values of n:
;
;   dec64_add(n, nan)
;   dec64_add(nan, n)
;   dec64_divide(nan, n)
;   dec64_integer_divide(nan, n)
;   dec64_modulo(nan, n)
;   dec64_round(nan, n)
;   dec64_subtract(n, nan)
;   dec64_subtract(nan, n)
;
; This file can be processed with Microsoft's ML64.exe. There might be other
; assemblers that can process this file, but that has not been tested.
;
;       You know what goes great with those defective pentium chips?
;       Defective pentium salsa! (David Letterman)

; All public symbols have a dec64_ prefix. All other symbols are private.

; There are 72057594037927936 possible nan values. Three are reserved:
;
;       DEC64_NULL
;       DEC64_TRUE
;       DEC64_FALSE
;
; The other 72057594037927933 nan values can be used for any purpose,
; including object pointers.

; When these functions return nan, they will always return DEC64_NULL,
; the normal nan.

; The comparison functions will return DEC64_TRUE or DEC64_FALSE.

%define null            8000000000000080h
%define true            8000000000000380h
%define false           8000000000000280h

; Multiplication by a scaled reciprocal can be faster than division.

%define eight_over_ten  -3689348814741910323

global dec64_abs;(number: dec64)
;   returns absolution: dec64

global dec64_add;(augend: dec64, addend: dec64)
;   returns sum: dec64

global dec64_ceiling;(number: dec64)
;   returns integer: dec64

global dec64_coefficient;(number: dec64)
;   returns coefficient: int64

global dec64_divide;(dividend: dec64, divisor: dec64)
;   returns quotient: dec64

global dec64_exponent;(number: dec64)
;   returns exponent: int64

global dec64_floor;(number: dec64)
;   returns integer: dec64

global dec64_integer_divide;(dividend: dec64, divisor: dec64)
;   returns quotient: dec64

global dec64_is_equal;(comparahend: dec64, comparator: dec64)
;   returns comparison: dec64

global dec64_is_false;(boolean: dec64)
;   returns comparison: dec64

global dec64_is_integer;(number: dec64)
;   returns comparison: dec64

global dec64_is_less;(comparahend: dec64, comparator: dec64)
;   returns comparison: dec64

global dec64_is_nan;(number: dec64)
;   returns comparison: dec64

global dec64_is_zero;(number: dec64)
;   returns comparison: dec64

global dec64_modulo;(dividend: dec64, divisor: dec64)
;   returns modulus: dec64

global dec64_multiply;(multiplicand: dec64, multiplier: dec64)
;   returns product: dec64

global dec64_neg;(number: dec64)
;   returns negation: dec64

global dec64_new;(coefficient: int64, exponent: int64)
;   returns number: dec64

global dec64_normal;(number: dec64)
;   returns normalization: dec64

global dec64_round;(number: dec64, place: dec64)
;   returns quantization: dec64

global dec64_signum;(number: dec64)
;   returns signature: dec64

global dec64_subtract;(minuend: dec64, subtrahend: dec64)
;   returns difference: dec64

;  -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

; Repair the register names. Over the long and twisted evolution of x86, the
; register names have picked up some weird, inconsistent conventions. We can
; simplify, naming them r0 thru r15, with a _b suffix to indicate the low
; order byte, an _h suffix to indicate the so-called high byte and a _w suffix
; to indicate the lower 16 bits. We would use _d to indicate the lower 32 bits,
; but that was not needed here.

%define r0      rax
%define r1      rcx
%define r2      rdx
%define r6      rsi
%define r7      rdi

%define r0_b    al
%define r1_b    cl
%define r2_b    dl
%define r8_b    r8b
%define r9_b    r9b
%define r10_b   r10b
%define r11_b   r11b

%define r0_h    ah
%define r1_h    ch
%define r2_h    dh

%define r0_w    ax
%define r1_w    cx

%define jr1z    jrcxz

; All of the public functions in this file accept up to two arguments, which
; are passed in registers (either r1, r2 or r7, r6), returning a result in r0.

; Registers r1, r2, r8, r9, r10, and r11 are clobbered. Register r0 is the
; return value. The other registers are not disturbed.

; There is painfully inadequate standardization around x64 calling conventions.
; On Win64, the first two arguments are passed in r1 and r2. On Unix, the first
; two arguments are passed in r7 and r6. We try to hide this behind macros. The
; two systems also have different conventions about which registers may be
; clobbered and which must be preserved. This code lives in the intersection.

; This has not yet been tested on Unix.

%define UNIX   1                ; calling convention: 0 for Windows, 1 for Unix

%macro function_with_one_parameter 0
    %if UNIX
    mov     r1, r7              ;; UNIX
    %endif
    %endmacro

%macro function_with_two_parameters 0
    %if UNIX
    mov     r1, r7              ;; UNIX
    mov     r2, r6              ;; UNIX
    %endif
    %endmacro

%macro call_with_one_parameter 1
    %if UNIX
    mov     r7, r1              ;; UNIX
    %endif
    call    %1
    %endmacro

%macro call_with_two_parameters 1
    %if UNIX
    mov     r7, r1              ;; UNIX
    mov     r6, r2              ;; UNIX
    %endif
    call    %1
    %endmacro

%macro tail_with_one_parameter 1
    %if UNIX
    mov     r7, r1              ;; UNIX
    %endif
    jmp     %1
    %endmacro

%macro tail_with_two_parameters 1
    %if UNIX
    mov     r7, r1              ;; UNIX
    mov     r6, r2              ;; UNIX
    %endif
    jmp     %1
    %endmacro

; There may be a performance benefit in padding programs so that most jump
; destinations are aligned on 16 byte boundaries.

%macro pad 0
    align   16
    %endmacro

;  -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_code section .text

power:                              ; the powers of 10

    dq      1                       ; 0
    dq      10                      ; 1
    dq      100                     ; 2
    dq      1000                    ; 3
    dq      10000                   ; 4
    dq      100000                  ; 5
    dq      1000000                 ; 6
    dq      10000000                ; 7
    dq      100000000               ; 8
    dq      1000000000              ; 9
    dq      10000000000             ; 10
    dq      100000000000            ; 11
    dq      1000000000000           ; 12
    dq      10000000000000          ; 13
    dq      100000000000000         ; 14
    dq      1000000000000000        ; 15
    dq      10000000000000000       ; 16
    dq      100000000000000000      ; 17
    dq      1000000000000000000     ; 18
    dq      10000000000000000000    ; 19

;  -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_coefficient: function_with_one_parameter
;(number: dec64) returns coefficient: int64

; Return the coefficient part from a dec64 number.

    mov     r0, r1
    sar     r0, 8
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_exponent: function_with_one_parameter
;(number: dec64) returns exponent: int64

; Return the exponent part, sign extended to 64 bits, from a dec64 number.
; dec64_exponent(nan) returns -128.

    movsx   r0, r1_b                ; r0 is the exponent
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_new: function_with_two_parameters
;(coefficient: int64, exponent: int64) returns number: dec64

; Construct a new dec64 number with a coefficient and an exponent.

    mov     r0, r1                  ; r0 is the coefficient
    test    r0, r0                  ; is the coefficient zero?
    jz      return_zero
    mov     r8, r2                  ; r8 is the exponent

; Fall into pack.

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

pack:

; The pack function will combine the coefficient and exponent into a dec64.
; Numbers that are too huge to be contained in this format become nan.
; Numbers that are too tiny to be contained in this format become zero.

; The coefficient is in r0.
; The exponent is in r8.

; If the exponent is greater than 127, then the number is too big.
; But it might still be possible to salvage a value.

    cmp     r8, 127                 ; compare exponent with 127
    jg      pack_decrease           ; it might be possible to decrease it

; If the exponent is too small, or if the coefficient is too large, then some
; division is necessary. The absolute value of the coefficient is off by one
; for the negative because
;    negative_extreme_coefficent = -(extreme_coefficent + 1)

    mov     r10, r0                 ; r10 is the coefficient
    mov     r1, 3602879701896396800 ; the ultimate coefficient * 100
    not     r10                     ; r10 is -coefficient
    xor     r11, r11                ; r11 is zero
    test    r10, r10                ; look at the sign bit
    cmovs   r10, r0                 ; r10 is the absolute value of coefficient
    cmp     r10, r1                 ; compare with the actual coefficient
    jae     pack_large              ; deal with the very large coefficient
    mov     r1, 36028797018963967   ; the ultimate coefficient - 1
    mov     r9, -127                ; r9 is the ultimate exponent
    cmp     r1, r10                 ; compare with the actual coefficient
    adc     r11, 0                  ; add 1 to r11 if 1 digit too big
    mov     r1, 360287970189639679  ; the ultimate coefficient * 10 - 1
    sub     r9, r8                  ; r9 is the difference from actual exponent
    cmp     r1, r10                 ; compare with the actual coefficient
    adc     r11, 0                  ; add 1 to r11 if 2 digits too big
    cmp     r9, r11                 ; which excess is larger?
    cmovl   r9, r11                 ; take the max
    test    r9, r9                  ; if neither was zero
    jnz     pack_increase           ; then increase the exponent by the excess
    shl     r0, 8                   ; shift the exponent into position
    cmovz   r8, r0                  ; if coefficient is zero, also zero the exp
    movzx   r8, r8_b                ; zero out all but the bottom 8 bits of exp
    or      r0, r8                  ; mix in the exponent
    ret                             ; whew
    pad

pack_large:

    mov     r1, r0                  ; r1 is the coefficient
    sar     r1, 63                  ; r1 is -1 if negative, or 0 if positive
    mov     r11, eight_over_ten     ; magic number
    mov     r9, r1                  ; r9 is -1 or 0
    xor     r0, r1                  ; complement the coefficient if negative
    and     r9, 1                   ; r9 is 1 or 0
    add     r0, r9                  ; r0 is absolute value of coefficient
    add     r8, 1                   ; add 1 to the exponent
    mul     r11                     ; multiply abs(coefficient) by magic number
    mov     r0, r2                  ; r0 is the product shift 64 bits
    shr     r0, 3                   ; r0 is divided by 8: abs(coefficient) / 10
    xor     r0, r1                  ; complement coefficient if it was negative
    add     r0, r9                  ; coefficient's sign is restored
    jmp     pack                    ; start over
    pad

pack_increase:

    mov     r10, power
    mov     r10, [r10+r9*8]         ; r10 is 10^r9
    cmp     r9, 20                  ; is the difference more than 20?
    jae     return_zero             ; if so, the result is zero (rare)
    mov     r11, r10                ; r11 is the power of ten
    neg     r11                     ; r11 is the negation of the power of ten
    test    r0, r0                  ; examine the sign of the coefficient
    cmovns  r11, r10                ; r11 has the sign of the coefficient
    sar     r11, 1                  ; r11 is half the signed power of ten
    add     r0, r11                 ; r0 is coefficient plus the rounding fudge
    cqo                             ; sign extend r0 into r2
    idiv    r10                     ; divide by the power of ten
    add     r8, r9                  ; increase the exponent
    jmp     pack                    ; start over
    pad

pack_decrease:

; The exponent is too big. We can attempt to reduce it by scaling back.
; This can salvage values in a small set of cases.

    imul    r0, 10                  ; try multiplying the coefficient by 10
    jo      return_null             ; if it overflows, we failed to salvage
    sub     r8, 1                   ; decrement the exponent
    test    r8, -128                ; is the exponent still over 127?
    jnz     pack_decrease           ; until the exponent is less than 127
    mov     r9, r0                  ; r9 is the coefficient
    sar     r9, 56                  ; r9 is top 8 bits of the coefficient
    adc     r9, 0                   ; add the ninth bit
    jnz     return_null             ; the number is still too large
    shl     r0, 8                   ; shift the exponent into position
    cmovz   r8, r0                  ; if coefficient is zero, also zero exponent
    movzx   r8, r8_b                ; zero out all but bottom 8 bits of exponent
    or      r0, r8                  ; mix in the exponent
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_round: function_with_two_parameters
;(number: dec64, place: dec64) returns quantization: dec64

; The place argument indicates at what decimal place to round.
;    -2        nearest cent
;     0        nearest integer
;     3        nearest thousand
;     6        nearest million
;     9        nearest billion

; The place should be between -16 and 16.

    pad

round_begin:

    cmp     r1_b, 128               ; is the number nan?
    jz      return_null
    test    r2_b, r2_b              ; is places already an integer?
    jnz     round_places            ;   nope
    mov     r10, r1                 ; r10 is the number
    mov     r11, r2                 ; r11 is the places
    movsx   r8, r10_b               ; r8 is the current exponent
    mov     r0, r10                 ; r0 is the coefficient
    sar     r11, 8                  ; r11 is place: int64
    sar     r0, 8                   ; r0 is the coefficient of the number
    jz      return_zero             ; if the coefficient is 0, the result is 0
    cmp     r8, r11                 ; compare the exponents
    jge     pack                    ; no rounding required
    mov     r1, r0
    mov     r9, eight_over_ten      ; magic
    neg     r1                      ; r1 is -coefficient
    cmovns  r0, r1                  ; r0 is abs(coefficient)
    pad

round_loop:

; Increment the exponent and divide the coefficient by 10 until the target
; exponent is reached. The division is accomplished by multiplying with a
; scaled reciprocal.

    mul     r9                      ; r2 is the coefficient * 8 / 10
    mov     r0, r2                  ; r0 is the coefficient * 8 / 10
    sar     r0, 3                   ; r0 is the coefficient / 10
    add     r8, 1                   ; increment the exponent
    cmp     r8, r11                 ; compare the exponent with the target
    jne     round_loop              ; loop if the exponent is not at the target

; Round if necessary and return the result.

    shr     r2, 2                   ; isolate the carry bit
    and     r2, 1                   ; r2 is 1 if rounding is needed
    add     r0, r2                  ; r0 is rounded
    mov     r1, r0
    neg     r1
    test    r10, r10                ; was the original number negative?
    cmovs   r0, r1
    jmp     pack                    ; pack it up
    pad

round_places:

; Places does not seem to be an integer. If it is nan, then default to zero.

    cmp     r2_b, 128
    jne     round_normal
    xor     r2, r2
    jmp     round_begin
    pad

round_normal:

    mov     r10, r1                 ; save the number
    mov     r1, r2                  ; pass the place
    call_with_one_parameter dec64_normal
    test    r0_b, r0_b              ; is the number of places a normal integer?
    jnz     return_null
    mov     r1, r10
    mov     r2, r0
    jmp     round_begin

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_add: function_with_two_parameters
;(augend: dec64, addend: dec64) returns sum: dec64

; Add two dec64 numbers together.

; If the two exponents are both zero (which is usually the case for integers)
; we can take the fast path. Since the exponents are both zero, we can simply
; add the numbers together and check for overflow.

    pad

add_begin:

    mov     r0, r1                  ; r0 is the first number
    or      r1_b, r2_b              ; r1_b is the two exponents or'd together
    jnz     add_slow                ; if either exponent is not zero, slow path
    add     r0, r2                  ; add the shifted coefficients together
    jo      add_overflow            ; if there was no overflow, we are done
    ret                             ; no need to pack
    pad

add_overflow:

; If there was an overflow (extremely unlikely) then we must make it fit.
; pack knows how to do that.

    rcr     r0, 1                   ; divide the sum by 2 and repair its sign
    movsx   r8, r1_b                ; r8 is the exponent
    sar     r0, 7                   ; r0 is the coefficient of the sum
    jmp     pack                    ; pack it up
    pad

add_slow:

; The slow path is taken if the two operands do not both have zero exponents.

    mov     r1, r0                  ; restore r1
    cmp     r0_b, 128               ; is the first operand nan?
    je      return_null             ; if nan, get out

; Are the two exponents the same? This will happen often, especially with
; money values.

    cmp     r1_b, r2_b              ; compare the two exponents
    jne     add_slower              ; if not equal, take the slower path

; The exponents match so we may add now. Zero out the exponents so there
; will be no carry into the coefficients when the coefficients are added.
; If the result is zero, then return the normal zero.

    and     r0, -256                ; remove the exponent
    and     r2, -256                ; remove the other exponent
    add     r0, r2                  ; add the shifted coefficients
    jo      add_overflow            ; if it overflows, it must be repaired
    cmovz   r1, r0                  ; if coefficient is zero, exponent is zero
    movzx   r1, r1_b                ; mask the exponent
    or      r0, r1                  ; mix in the exponent
    ret                             ; no need to pack
    pad

add_slower:

; The slower path is taken when neither operand is nan, and their
; exponents are different. Before addition can take place, the exponents
; must be made to match. Swap the numbers if the second exponent is greater
; than the first.

    cmp     r2_b, 128               ; Is the second operand nan?
    je      return_null
    cmp     r1_b, r2_b              ; compare the exponents
    mov     r0, r1                  ; r0 is the first number
    cmovl   r1, r2                  ; r1 is the number with the larger exponent
    cmovl   r2, r0                  ; r2 is the number with the smaller exponent

; Shift the coefficients of r1 and r2 into r10 and r11 and unpack the exponents.

    mov     r10, r1                 ; r10 is the first number
    mov     r11, r2                 ; r11 is the second number
    movsx   r8, r1_b                ; r8 is the first exponent
    movsx   r9, r2_b                ; r9 is the second exponent
    sar     r10, 8                  ; r10 is the first coefficient
    sar     r11, 8                  ; r11 is the second coefficient
    mov     r0, r10                 ; r0 is the first coefficient
    pad

add_slower_decrease:

; The coefficients are not the same. Before we can add, they must be the same.
; We will try to decrease the first exponent. When we decrease the exponent
; by 1, we must also multiply the coefficient by 10. We can do this as long as
; there is no overflow. We have 8 extra bits to work with, so we can do this
; at least twice, possibly more.

    imul    r0, 10                  ; before decrementing the exponent, multiply
    jo      add_slower_increase
    sub     r8, 1                   ; decrease the first exponent
    mov     r10, r0                 ; r10 is the enlarged first coefficient
    cmp     r8, r9                  ; are the exponents equal yet?
    jg      add_slower_decrease
    mov     r0, r11                 ; r0 is the second coefficient

; The exponents are now equal, so the coefficients may be added.

    add     r0, r10                 ; add the two coefficients
    jmp     pack                    ; pack it up
    pad

add_slower_increase:

; We cannot decrease the first exponent any more, so we must instead try to
; increase the second exponent, which will result in a loss of significance.
; That is the heartbreak of floating point.

; Determine how many places need to be shifted. If it is more than 17, there is
; nothing more to add.

    mov     r2, r8                  ; r2 is the first exponent
    sub     r2, r9                  ; r2 is the remaining exponent difference
    mov     r0, r11                 ; r0 is the second coefficient
    cmp     r2, 17                  ; 17 is the max digits in a packed coefficient
    ja      return_r1               ; too small to matter
    mov     r9, power
    mov     r9, [r9+r2*8]           ; r9 is the power of ten
    cqo                             ; sign extend r0 into r2
    idiv    r9                      ; divide second coefficient by power of 10
    test    r0, r0                  ; examine the scaled coefficient
    jz      return_r1               ; too insignificant to add?

; The exponents are now equal, so the coefficients may be added.

    add     r0, r10                 ; add the two coefficients
    jmp     pack
    pad

return_r1:

    mov     r0, r1                  ; r0 is the original number
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_ceiling: function_with_one_parameter
;(number: dec64) returns integer: dec64

; Produce the smallest integer that is greater than or equal to the number. In
; the result, the exponent will be greater than or equal to zero unless it is
; nan. Numbers with positive exponents will not be modified, even if the
; numbers are outside of the safe integer range.

; Preserved: r11.

    mov     r9, 1                   ; r9 is the round up flag
    jmp     floor_begin

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_floor: function_with_one_parameter
;(number: dec64) returns integer: dec64

; Produce the largest integer that is less than or equal to the number. This
; is sometimes called the entier function. In the result, the exponent will be
; greater than or equal to zero unless it is nan. Numbers with positive
; exponents will not be modified, even if the numbers are outside of the safe
; integer range.

; Preserved: r11.

    mov     r9, -1                  ; r9 is the round down flag
    pad

floor_begin:

    cmp     r1_b, 128
    je      return_null
    mov     r0, r1                  ; r0 is the number
    movsx   r8, r1_b                ; r8 is the exponent
    sar     r0, 8                   ; r0 is the coefficient
    cmovz   r1, r0                  ; if coefficient is zero, the number is zero
    neg     r8                      ; r8 is the negated exponent
    test    r1_b, r1_b              ; examine the exponent
    jns     return_r1               ; nothing to do unless exponent was negative
    cmp     r8, 17                  ; is the exponent is too extreme?
    jae     floor_micro             ; deal with a micro number
    mov     r10, power
    mov     r10, [r10+r8*8]         ; r10 is the power of ten
    cqo                             ; sign extend r0 into r2
    idiv    r10                     ; divide r2:r0 by 10
    test    r2, r2                  ; examine the remainder
    jnz     floor_remains           ; deal with the remainder
    shl     r0, 8                   ; pack the coefficient
    ret
    pad

floor_micro:

    mov     r2, r0                  ; r2 is the coefficient
    xor     r0, r0                  ; r0 is zero
    pad

floor_remains:

; If the remainder is negative and the rounding flag is negative, then we need
; to decrement r0. But if the remainder and the rounding flag are both
; positive, then we need to increment r0.

    xor     r10, r10                ; r10 is zero
    xor     r2, r9                  ; xor the remainder and the rounding
    cmovs   r9, r10                 ; if different signs, clear the rounding
    add     r0, r9                  ; add the rounding to the coefficient
    shl     r0, 8                   ; pack the coefficient
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_subtract: function_with_two_parameters
;(minuend: dec64, subtrahend: dec64) returns difference: dec64

; Subtract the dec64 number in r2 from the dec64 number in r1.
; The result is in r0.

; This is the same as dec64_add, except that the operand in r2 has its
; coefficient complemented first.

    xor     r2, -256                ; not the coefficient only
    add     r2, 256                 ; two's complement increment of coefficient
    jno     add_begin               ; if there is no overflow, begin the beguine

; The subtrahend coefficient is -36028797018963968. This value cannot easily be
; complemented, so take the slower path. This should be extremely rare.

    cmp     r1_b, 128               ; is the first operand nan
    sete    r0_b
    cmp     r2_b, 128               ; is the second operand nan?
    sete    r0_h
    or      r0_b, r0_h              ; is either nan?
    jnz     return_null
    mov     r10, r1                 ; r10 is the first coefficient
    movsx   r8, r1_b                ; r8 is the first exponent
    sar     r10, 8
    movsx   r9, r2_b                ; r9 is the second exponent
    mov     r11, 80000000000000H    ; r11 is 36028797018963968
    mov     r0, r10                 ; r0 is the first coefficient
    cmp     r8, r9                  ; if the second exponent is larger, swap
    jge     subtract_slower_decrease_compare
    mov     r0, r11                 ; r0 is the second coefficient
    xchg    r8, r9                  ; swap the exponents
    xchg    r10, r11                ; swap the coefficients
    jmp     subtract_slower_decrease_compare
    pad

subtract_slower_decrease:

; The coefficients are not the same. Before we can add, they must be the same.
; We will try to decrease the first exponent. When we decrease the exponent
; by 1, we must also multiply the coefficient by 10. We can do this as long as
; there is no overflow. We have 8 extra bits to work with, so we can do this
; at least twice, possibly more.

    imul    r0, 10                  ; before decrementing the exponent, multiply
    jo      subtract_slower_increase
    sub     r8, 1                   ; decrease the first exponent
    mov     r10, r0                 ; r10 is the enlarged first coefficient
    pad

subtract_slower_decrease_compare:

    cmp     r8, r9                  ; are the exponents equal yet?
    jg      subtract_slower_decrease
    mov     r0, r11                 ; r0 is the second coefficient

; The exponents are now equal, so the coefficients may be added.

    add     r0, r10                 ; add the two coefficients
    jmp     pack                    ; pack it up
    pad

subtract_slower_increase:

; We cannot decrease the first exponent any more, so we must instead try to
; increase the second exponent, which will result in a loss of significance.
; That is the heartbreak of floating point.

; Determine how many places need to be shifted. If it is more than 17, there is
; nothing more to add.

    mov     r2, r8                  ; r2 is the first exponent
    sub     r2, r9                  ; r2 is the remaining exponent difference
    mov     r0, r11                 ; r0 is the second coefficient
    cmp     r2, 17                  ; 17 is the max digits in packed coefficient
    ja      subtract_underflow      ; too small to matter
    mov     r9, power
    mov     r9, [r9+r2*8]           ; r9 is the power of ten
    cqo                             ; sign extend r0 into r2
    idiv    r9                      ; divide second coefficient by power of 10

; The exponents are now equal, so the coefficients may be added.

    add     r0, r10                 ; add the two coefficients
    jmp     pack
    pad

subtract_underflow:

    mov     r0, r10                 ; r0 is the first coefficient
    jmp     pack

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_multiply: function_with_two_parameters
;(multiplicand: dec64, multiplier: dec64) returns product: dec64

; Multiply two dec64 numbers together.

; Unpack the exponents into r8 and r9.

    movsx   r8, r1_b                ; r8 is the first exponent
    movsx   r9, r2_b                ; r9 is the second exponent

; Set flags in r0 if either operand is nan.

    cmp     r1_b, 128               ; is the first operand nan?
    sete    r0_b                    ; r0_b is 1 if the first operand is nan
    cmp     r2_b, 128               ; is the second operand nan?
    sete    r0_h                    ; r0_h is 1 if the second operand is nan

; Unpack the coefficients. Set flags in r1 if either is not zero.

    sar     r1, 8                   ; r1 is the first coefficient
    mov     r10, r1                 ; r10 is the first coefficient
    setnz   r1_b                    ; r1_b is 1 if first coefficient is not zero
    sar     r2, 8                   ; r2 is the second coefficient
    setnz   r1_h                    ; r1_h is 1 if second coefficient is not 0

; The result is nan if one or both of the operands is nan and neither of the
; operands is zero.

    or      r1_w, r0_w              ; is either coefficient zero and not nan?
    xchg    r1_b, r1_h
    test    r0_w, r1_w
    jnz     return_null

    mov     r0, r10                 ; r0 is the first coefficient
    add     r8, r9                  ; r8 is the product exponent
    imul    r2                      ; r2:r0 is r1 * r2
    jno     pack                    ; if product fits in 64 bits, start packing

; There was overflow.

; Make the 110 bit coefficient in r2:r0Er8 all fit. Estimate the number of
; digits of excess, and increase the exponent by that many digits.
; We use 77/256 to convert log2 to log10.

    mov     r9, r2                  ; r9 is the excess significance
    xor     r1, r1                  ; r1 is zero anticipating bsr
    neg     r9
    cmovs   r9, r2                  ; r9 is abs of excess significance

    bsr     r1, r9                  ; find the position of most significant bit
    imul    r1, 77                  ; multiply the bit number by 77/256 to
    shr     r1, 8                   ;     convert a bit number to a digit number
    add     r1, 2                   ; add two extra digits to the scale
    add     r8, r1                  ; increase the exponent
    mov     r9, power
    idiv    qword [r9+r1*8]         ; divide by the power of ten
    jmp     pack

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_divide: function_with_two_parameters
;(dividend: dec64, divisor: dec64) returns quotient: dec64

; Divide a dec64 number by another.

; Begin unpacking the components.

    cmp     r2, 200h
    jz      divide_two
    movsx   r8, r1_b                ; r8 is the first exponent
    movsx   r9, r2_b                ; r9 is the second exponent
    mov     r10, r1                 ; r10 is the first number
    mov     r11, r2                 ; r11 is the second number

; Set nan flags in r0.

    cmp     r1_b, 128               ; is the first operand nan?
    sete    r0_b                    ; r0_b is 1 if the first operand is nan
    cmp     r2_b, 128               ; is the second operand nan?
    sete    r0_h                    ; r0_h is 1 if the second operand is nan

    sar     r10, 8                  ; r10 is the dividend coefficient
    setnz   r1_b                    ; r1_b is 1 if dividend coefficient is not 0
    or      r1_b, r0_b              ; r0_b is 1 if either is nan or divisor is 0
    jz      return_zero             ; if the dividend is zero,  quotient is zero
    sar     r11, 8                  ; r11 is the divisor coefficient
    setz    r1_b                    ; r1_h is 1 if dividing by zero
    or      r0_h, r0_b              ; r0_h is 1 if either is nan
    or      r1_b, r0_h              ; r1_b is zero if dividend is zero & not nan
    jnz     return_null
    sub     r8, r9                  ; r8 is the quotient exponent
    pad

divide_measure:

; We want to get as many bits into the quotient as possible in order to capture
; enough significance. But if the quotient has more than 64 bits, then there
; will be a hardware fault. To avoid that, we compare the magnitudes of the
; dividend coefficient and divisor coefficient, and use that to scale the
; dividend to give us a good quotient.

    mov     r0, r10                 ; r0 is the first coefficient
    mov     r1, r11                 ; r1 is the second coefficient
    neg     r0                      ; r0 is negated
    cmovs   r0, r10                 ; r0 is abs of dividend coefficient
    neg     r1                      ; r1 is negated
    cmovs   r1, r11                 ; r1 is abs of divisor coefficient
    bsr     r0, r0                  ; r0 is the dividend most significant bit
    bsr     r1, r1                  ; r1 is the divisor most significant bit

; Scale up the dividend to be approximately 58 bits longer than the divisor.
; Scaling uses factors of 10, so we must convert from a bit count to a digit
; count by multiplication by 77/256 (approximately LN2/LN10).

    add     r1, 58                  ; we want approx 58 bits in raw quotient
    sub     r1, r0                  ; r1 is the number of bits to add to the dividend
    imul    r1, 77                  ; multiply by 77/256 to convert bits|digits
    shr     r1, 8                   ; r1 is number of digits to scale dividend

; The largest power of 10 that can be held in an int64 is 1e18.

    cmp     r1, 18                  ; prescale the dividend if 10**r1 won't fit
    jg      divide_prescale

; Multiply the dividend by the scale factor, and divide that 128 bit result by
; the divisor.  Because of the scaling, the quotient is guaranteed to use most
; of the 64 bits in r0, and never more. Reduce the final exponent by the number
; of digits scaled.

    mov     r0, r10                 ; r0 is the dividend coefficient
    mov     r9, power
    imul    qword [r9+r1*8]         ; r2:r0 is the dividend coefficient * 10**r1
    idiv    r11                     ; r0 is the quotient
    sub     r8, r1                  ; r8 is the exponent
    jmp     pack                    ; pack it up
    pad

divide_prescale:

; If the number of scaling digits is larger than 18, then we will have to
; scale in two steps: first prescaling the dividend to fill a register, and
; then repeating to fill a second register. This happens when the divisor
; coefficient is much larger than the dividend coefficient.

    mov     r1, 58                  ; we want 58 bits or so in the dividend
    sub     r1, r0                  ; r1 is the number of additional bits needed
    imul    r1, 77                  ; convert bits to digits
    shr     r1, 8                   ; shift 8 is cheaper than div 256
    mov     r9, power
    imul    r10, qword [r9+r1*8]    ; multiply the dividend by power of ten
    sub     r8, r1                  ; reduce the exponent
    jmp     divide_measure          ; try again

divide_two:

; Divide a dec64 number by two.

    cmp     r1_b, 128
    je      return_null
    test    r1_h, 1
    jz      divide_half

; Unpack the components into r8 and r1, multiply by 5 and divide by 10.

    movsx   r8, r1_b                ; r8 is the exponent
    sar     r1, 8                   ; r1 is the coefficient
    sub     r8, 1                   ; bump down the exponent
    lea     r0, [r1+r1*4]           ; r0 is the coefficient * 5
    jmp     pack
    pad

divide_half:

; If the least significant bit of the coefficient is 0, then we can do this
; the fast way. Shift the coefficient by 1 bit and restore the exponent. If
; the shift produces zero, even easier.

    mov     r0, -256                ; r0 is the coefficient mask
    and     r0, r1                  ; r0 is the coefficient shifted 8 bits
    jz      return
    sar     r0, 1                   ; r0 is divided by 2
    movzx   r1, r1_b                ; zero out r1 except lowest 8 bits
    or      r0, r1                  ; mix in the exponent
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_integer_divide: function_with_two_parameters
;(dividend: dec64, divisor: dec64) returns quotient: dec64

; Divide, with a floored integer result. It produces the same result as
;    dec64_floor(dec64_divide(dividend, divisor))
; but can sometimes produce that result more quickly.

    cmp     r1_b, r2_b              ; are the exponents equal?
    jne     integer_divide_slow

    mov     r0, r1                  ; r0 is the dividend
    mov     r11, r2                 ; r11 is the divisor
    sar     r1, 8                   ; r1 is the dividend coefficient
    setnz   r2_h                    ; r2_h is 1 if dividend coefficient is not 0
    cmp     r0_b, 128               ; are the operands nan?
    sete    r2_b                    ; r2_b is 1 if the operands are nan
    and     r0, -256                ; zero the dividend's exponent
    or      r2_h, r2_b              ; r2_h is zero if dividend is 0 and not nan
    jz      return_zero             ; the quotient is zero if dividend is zero
    sar     r11, 8                  ; r11 is the divisor coefficient
    setz    r2_h                    ; r2_h is 1 if divisor coefficient is zero
    or      r2_b, r2_h              ; r2_b is 1 if the result is nan
    jnz     return_null
    cqo                             ; sign extend r0 into r2
    idiv    r11                     ; r0 is the quotient
    and     r0, -256                ; zero the exponent again
    ret                             ; no need to pack
    pad

integer_divide_slow:

; The exponents are not the same, so do it the hard way.

    call_with_two_parameters dec64_divide
    mov     r1, r0
    tail_with_one_parameter dec64_floor

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_modulo: function_with_two_parameters
;(dividend: dec64, divisor: dec64) returns modulus: dec64

; Modulo. It produces the same result as
;    dec64_subtract(
;        dividend,
;        dec64_multiply(
;            dec64_integer_divide(dividend, divisor),
;            divisor
;        )
;    )

    cmp     r1_b, r2_b              ; are the two exponents the same?
    jnz     modulo_slow             ; if not take the slow path
    mov     r0, r1                  ; r0 is the dividend
    mov     r11, r2                 ; r11 is the divisor
    cmp     r1_b, 128               ; is the first operand nan?
    setne   r2_h                    ; r2_h is 1 if the operands are not nan
    sar     r0, 8                   ; r0 is the dividend coefficient
    setz    r2_b                    ; r2_b is 1 if dividend coefficient is zero
    test    r2_b, r2_h              ; is the dividend is zero and not nan?
    jnz     return_zero             ; quotient is zero if the dividend is zero
    sar     r11, 8                  ; r11 is the divisor coefficient
    setnz   r2_b                    ; r2_b is 1 if divisor coefficient is not 0
    test    r2_h, r2_b              ; is either operand nan or is divisor zero?
    jz      return_null
    cqo                             ; sign extend r0 into r2
    idiv    r11                     ; divide r2:r0 by the divisor

; If the signs of the divisor and remainder are different and the remainder is
; not zero, add the divisor to the remainder.

    xor     r0, r0                  ; r0 is zero
    mov     r10, r2                 ; r10 is the remainder
    test    r2, r2                  ; examine the remainder
    cmovz   r11, r0                 ; r11 is zero if the remainder is zero
    xor     r10, r11                ; r10 is remainder xor divisor
    cmovs   r0, r11                 ; r0 is divisor if the signs were different
    add     r0, r2                  ; r0 is the corrected result
    cmovz   r1, r0                  ; if r0 is zero, so is the exponent
    shl     r0, 8                   ; position the coefficient
    mov     r0_b, r1_b              ; mix in the exponent
    ret
    pad

modulo_slow:

; The exponents are not the same, so do it the hard way.

    push    r1                      ; save the dividend
    push    r2                      ; save the divisor

    call_with_two_parameters dec64_integer_divide
    pop     r2
    mov     r1, r0
    call_with_two_parameters dec64_multiply
    pop     r1
    mov     r2, r0
    tail_with_two_parameters dec64_subtract

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_signum: function_with_one_parameter
;(number: dec64) returns signature: dec64

; If the number is nan, the result is nan.
; If the number is less than zero, the result is -1.
; If the number is zero, the result is 0.
; If the number is greater than zero, the result is 1.

    cmp     r1_b, 128
    jz      return_null
    mov     r0, r1                  ; r0 is the number
    sar     r0, 8                   ; r0 is the coefficient
    mov     r8, r0                  ; r8 is the coefficient too
    neg     r0                      ; r0 is -coefficient
    sar     r8, 63                  ; r8 is extended sign: -1 if negative else 0
    shr     r0, 63                  ; r0 is 1 if the number was greater than 0
    or      r0, r8                  ; r0 is -1, 0, or 1
    shl     r0, 8                   ; package the number with a zero exponent
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_neg: function_with_one_parameter
;(number: dec64) returns negation: dec64

; Negate a number. We need to negate the coefficient without changing the
; exponent.

    cmp     r1_b, 128               ; compare the exponent to nan
    je      return_null             ; is the number nan?
    mov     r2, r1                  ; r2 is the number
    mov     r0, -256                ; r0 is the coefficient mask
    sar     r2, 8                   ; r2 is the coefficient
    cmovz   r1, r2                  ; if coefficient is zero, then exponent too
    xor     r0, r1                  ; r0 has exponent and complemented coef
    add     r0, 256                 ; r0 has exponent and negated coefficient
    jo      neg_overflow            ; nice day if it don't overflow
    ret

neg_overflow:

; The coefficient is -36028797018963968, which is the only coefficient that
; cannot be trivially negated. So we do this the hard way.

    mov     r0, r2
    movsx   r8, r1_b                ; r8 is the exponent
    neg     r0
    jmp     pack

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_abs: function_with_one_parameter
;(number: dec64) returns absolution: dec64

; Find the absolute value of a number. If the number is negative, hand it off
; to dec64_neg. Otherwise, return the number unless it is nan or zero.

    cmp     r1_b, 128               ; is the number nan?
    je      return_null
    test    r1, r1                  ; examine r1
    js      dec64_neg               ; is the number negative?
    mov     r0, r1                  ; r0 is the number
    sar     r1, 8                   ; r1 is the coefficient
    cmovz   r0, r1                  ; if coefficient is zero, result is zero
    ret                             ; return the number

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_is_equal: function_with_two_parameters
;(comparahend: dec64, comparator: dec64) returns comparison: dec64

; Compare two dec64 numbers. If they are equal, return 1, otherwise return 0.
; Denormal zeroes are equal but denormal nans are not.

; If the numbers are trivally equal, then return 1.

    cmp     r1, r2                  ; compare the two numbers
    je      return_true

; If the comparator is nan, then we ask dec64_is_nan.

    cmp     r2_b, 128
    je      dec64_is_nan

; If the exponents match or if their signs are different, then return false.

    mov     r0, r1                  ; r0 is the first number
    xor     r0, r2                  ; r0 the xor of the two numbers
    sets    r0_h                    ; r0_h is 1 if the signs are different
    cmp     r1_b, r2_b              ; compare the two exponents
    sete    r0_b                    ; r0_b is 1 if the exponents are the same
    or      r0_b, r0_h
    jnz     return_false

; Do it the hard way by subtraction. Is the difference zero?

    call_with_two_parameters dec64_subtract ; r0 is r1 - r2
    cmp     r0_b, 128               ; is the difference nan?
    je      return_false
    or      r0, r0                  ; examine the difference
    jz      return_true
    jmp     return_false
    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_normal: function_with_one_parameter
;(number: dec64) returns normalization: dec64

; Make the exponent as close to zero as possible without losing any signficance.
; Usually normalization is not needed since it does not materially change the
; value of a number.

; Preserves r10 and r11.

    mov     r0, r1                  ; r0 is the number
    cmp     r1_b, 128               ; compare the exponent to nan
    jz      return_null             ; if exponent is nan, the result is nan
    and     r0, -256                ; r0 is the coefficient shifted 8 bits
    mov     r8, 10                  ; r8 is the divisor
    cmovz   r1, r0                  ; r1 is zero if r0 is zero
    mov     r9, r0                  ; r9 is the coefficient shifted 8 bits
    test    r1_b, r1_b              ; examine the exponent
    jz      return                  ; if the exponent is zero, return r0
    jns     normal_multiply         ; if the exponent is positive
    sar     r0, 8                   ; r0 is the coefficient
    sar     r9, 8                   ; r9 is the coefficient
    pad

normal_divide:

; While the exponent is less than zero, divide the coefficient by 10 and
; increment the exponent.

    cqo                             ; sign extend r0 into r2
    idiv    r8                      ; divide r2:r0 by 10
    test    r2, r2                  ; examine the remainder
    jnz     normal_divide_done      ; if r2 is not zero, we are done
    mov     r9, r0                  ; r9 is the coefficient
    add     r1_b, 1                 ; increment the exponent
    jnz     normal_divide           ; until the exponent is zero
    pad

normal_divide_done:

    mov     r0, r9                  ; r0 is the finished coefficient
    shl     r0, 8                   ; put it in position
    mov     r0_b, r1_b              ; mix in the exponent
    ret
    pad

normal_multiply:

; While the exponent is greater than zero, multiply the coefficient by 10 and
; decrement the exponent. If the coefficient gets too large, wrap it up.

    imul    r0, 10                  ; r0 is r0 * 10
    jo      normal_multiply_done    ; return zero if overflow
    mov     r9, r0                  ; r9 is the coefficient
    sub     r1_b, 1                 ; decrement the exponent
    jnz     normal_multiply         ; until the exponent is zero
    ret
    pad

normal_multiply_done:

    mov     r0, r9                  ; r0 is the finished positioned coefficient
    mov     r0_b, r1_b              ; mix in the exponent
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

return:

; Return whatever is in r0.

    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

return_null:

; All of the dec64_ functions return only this form of nan.

    mov     r0, null
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

return_one:

    mov     r0, 0100h               ; one
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

return_zero:

    xor     r0, r0                  ; zero
    ret

    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_is_less: function_with_two_parameters
;(comparahend: dec64, comparator: dec64) returns comparison: dec64

; Compare two dec64 numbers. If the first is less than the second, return true,
; otherwise return false. Any nan value is greater than any number value

; The other 3 comparison functions are easily implemented with dec64_is_less:

;    dec64_is_greater(a, b)          => dec64_is_less(b, a)
;    dec64_is_greater_or_equal(a, b) => dec64_is_false(dec64_is_less(a, b))
;    dec64_is_less_or_equal(a, b)    => dec64_is_false(dec64_is_less(b, a))

; If the exponents are the same, then do a simple compare.

    cmp     r1_b, r2_b              ; are the two exponents equal?
    jne     less_slow               ; deal with unequal exponents
    cmp     r1, r2                  ; compare the numbers
    jge     return_false            ; greater or equal
    cmp     r1_b, 128               ; are the arguments nan?
    jne     return_true             ; less than
    jmp     return_false            ; nan is not less than nan
    pad

less_slow:

 ; The exponents are not the same.

    cmp     r1_b, 128               ; is the first argument nan?
    je      return_false            ; nan is greater than numbers
    cmp     r2_b, 128               ; is the second argument nan?
    je      return_true             ; numbers are less than nan

; Unpack the numbers.

    mov     r11, r2                 ; r11 is the second number
    movsx   r8, r1_b                ; r8 is the first exponent
    movsx   r9, r2_b                ; r9 is the second exponent
    mov     r0, power               ; r0 is the power array
    mov     r2, 18                  ; r2 is 18
    sar     r1, 8                   ; r1 is the first coefficient
    sar     r11, 8                  ; r11 is the second coefficient

; The maximum cofficient is 36028797018963967. 10**18 is more than that.

    sub     r8, r9                  ; r8 is the exponent difference
    js      less_second             ; is the difference negative?
    cmp     r8, r2                  ; compare the exponent difference to 18
    cmovge  r8, r2                  ; r8 is min(exponent difference, 18)

; We need to make them conform before we can compare. Multiply the first
; coefficient by 10**(first exponent - second exponent)

    mov     r0, [r0+r8*8]           ; r0 is power[difference]
    imul    r1                      ; r2, r0 is the amplified first coefficient
    jo      less_overflow_first
    cmp     r0, r11                 ; is first coefficient less than the other?
    jl      return_true
    jmp     return_false
    pad

less_second:

    neg     r8
    cmp     r8, r2                  ; compare the exponent difference to 18
    cmovge  r8, r2                  ; r8 is min(exponent difference, 18)
    mov     r0, [r0+r8*8]           ; r0 is 10 ** min(exponent difference, 18)
    imul    r11                     ; r2, r0 is the amplified second coefficient
    jo      less_overflow_second
    cmp     r1, r0                  ; is first coefficient less than the other?
    jl      return_true
    jmp     return_false
    pad

less_overflow_first:

    test    r2, r2
    jns     return_false
    jmp     return_true
    pad

less_overflow_second:

    test    r2, r2
    jns     return_true
    jmp     return_false
    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_is_integer: function_with_one_parameter
;(number: dec64) returns comparison: dec64

; If the number contains a non-zero fractional part or if it is nan, return
; false. Otherwise, return true.

    cmp     r1_b, 128               ; nan exponent?
    je      return_false            ; nan is not an integer
    mov     r0, r1                  ; r0 is the number
    sar     r0, 8                   ; r0 is the coefficient
    jz      return_true             ; zero is an integer

    test    r1_b, r1_b              ; examine the exponent
    jns     return_true             ; a positive exponent means an integer
    cmp     r1_b, -17               ; huge negative exponents can never be int
    jl      return_false
    movsx   r8, r1_b                ; r8 is the first exponent
    neg     r8                      ; negate the exponent
    mov     r9, power
    mov     r10, [r9+r8*8]          ; r10 is 10^-exponent
    cqo                             ; sign extend r0 into r2
    idiv    r10                     ; divide r2:r0 by the power of ten
    test    r2, r2                  ; examine the remainder
    jz      return_true             ; if the remainder is zero, then return true
    jmp     return_false
    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_is_nan: function_with_one_parameter
;(number: dec64) returns comparison: dec64

    cmp     r1_b, 128               ; is r1 nan?
    je      return_true
    jmp     return_false
    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_is_zero: function_with_one_parameter
;(number: dec64) returns comparison: dec64

    cmp     r1_b, 128               ; is r1 nan?
    setne   r0_b                    ; r0_b is 1 if not nan
    sar     r1, 8                   ; r1 is the coefficient
    setz    r0_h                    ; r0_h is 1 if zero
    test    r0_b, r0_h
    jnz     return_true
    jmp     return_false
    pad; -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --

dec64_is_false: function_with_one_parameter
;(boolean: dec64) returns notation: dec64

; If the argument is false, the result is true.
; Otherwise, the result is false.

    mov     r0, false
    cmp     r1, r0
    je      return_true

return_false:

    mov     r0, false
    ret

return_true:

    mov     r0, true
    ret

## test_android.sh
#!/bin/bash

# This script builds and runs the DEC64 tests for an ARM64 Android device. It
# takes no arguments.

# DEC64's ARM source must be tweaked slightly to make it compatible with
# the Android NDK's clang:

#   0. Comments start with a double slash, not a semicolon.
#   1. The global directive, like all directives, is preceeded by a period.
#   2. The area directive is not supported. We do, however, align.
#   3. Labels must be suffixed with a colon.
#   4. An 8 byte constant is declared with .quad rather than dcq.
#   5. When shifting an address offset, an extra comma is required.
#   6. The end directive is unnecessary.

sed \
    -E \
    -e "s_;_//_g" \
    -e "s/global ([a-z0-9_]+) \\[func\\]/.global \1/g" \
    -e "s/area dec64, align=8, code, readonly/.align 8/g" \
    -e "s/^([a-z0-9_]+)/\1:/g" \
    -e "s/dcq/.quad/g" \
    -e "s/x([0-9]) lsl 3/x\1, lsl 3/g" \
    -e "/    end/d" \
    <dec64.s \
    >dec64.android.s \
|| exit

build_and_run() {

# Compile, assemble and link a source file into a binary executable, then
# transfer and run that executable on a connected Android device.

    src=$1
    bin=$2

# The tests shift negative integers on purpose, so we suppress that warning.

    "NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android32-clang" \
        -Wno-shift-negative-value \
        -o $bin \
        dec64_string.c \
        dec64_math.c \
        dec64.android.s \
        $src \
    && adb push ./$bin /data/local/tmp/$bin \
    && adb shell /data/local/tmp/$bin
}

build_and_run dec64_test.c dec64_test \
&& build_and_run dec64_string_test.c dec64_string_test \
&& build_and_run dec64_math_test.c dec64_math_test \
|| exit

# This script has been tested on a Samsung A52 5G device running Android 12,
# using the Android NDK r25b.

## test_darwin_intel.sh
#!/bin/sh

# This script builds and runs the DEC64 tests for an x64 machine running MacOS
# (Darwin). It takes no arguments.

# Assemble DEC64 into dec64.o. We prefix the function names with an underscore,
# because that is what clang expects.

nasm \
    -f macho64 \
    --prefix _ \
    dec64.nasm \
|| exit

build() {

# Compile, assemble and link a source file into a binary executable.

# We opt out of creating a position independent executable (a PIE), because we
# depend on absolute addressing. Without passing the linker the -no_pie flag,
# we get a warning like

#   PIE disabled. Absolute addressing (perhaps -mdynamic-no-pic) not allowed in
#   code signed PIE, but used in pack_increase from dec64.o.

# because of lines in dec64.nasm like

#   mov     r10, [r10+r9*8]

# The tests shift negative integers on purpose, so we suppress that warning.

    src=$1
    bin=$2
    clang \
        -Wl,-no_pie \
        -Wno-shift-negative-value \
        -o $bin \
        dec64_string.c \
        dec64_math.c \
        dec64.o \
        $src
}

# Build the tests.

build dec64_test.c dec64_test \
&& build dec64_string_test.c dec64_string_test \
&& build dec64_math_test.c dec64_math_test \
|| exit

# Run the tests.

./dec64_test \
&& ./dec64_string_test \
&& ./dec64_math_test \
|| exit

# This script has been tested on MacOS 11.4 (Big Sur), using NASM
# v2.15.05 and clang 13.0.0.

## test_darwin_silicon.sh
#!/bin/bash

# This script builds and runs the DEC64 tests for a Macintosh with an Apple
# Silicon processor. It takes no arguments.

# DEC64's ARM source must be tweaked slightly to make it compatible with clang:

#   0. Comments start with a double slash, not a semicolon.
#   1. The global directive, like all directives, is preceeded by a period.
#   2. The area directive is not supported. We do, however, align.
#   3. Labels must be suffixed with a colon.
#   4. An 8 byte constant is declared with .quad rather than dcq.
#   5. When shifting an address offset, an extra comma is required.
#   6. The end directive is unnecessary.
#   7. The names of entrypoints require a leading underscore to link correctly.

sed \
    -E \
    -e "s_;_//_g" \
    -e "s/global ([a-z0-9_]+) \\[func\\]/.global \1/g" \
    -e "s/area dec64, align=8, code, readonly/.align 8/g" \
    -e "s/^([a-z0-9_]+)/\1:/g" \
    -e "s/dcq/.quad/g" \
    -e "s/x([0-9]) lsl 3/x\1, lsl 3/g" \
    -e "/    end/d" \
    -e "s/dec64_/_dec64_/g" \
    <dec64.s \
    >dec64.silicon.s \
|| exit

build_and_run() {

# Compile, assemble and link a source file into a binary executable, then
# run it.

    src=$1
    bin=$2

# The tests shift negative integers on purpose, so we suppress that warning.

    clang \
        -Wno-shift-negative-value \
        -o $bin \
        dec64.silicon.s \
        dec64_string.c \
        dec64_math.c \
        $src \
    && ./$bin
}

build_and_run dec64_test.c dec64_test \
&& build_and_run dec64_string_test.c dec64_string_test \
&& build_and_run dec64_math_test.c dec64_math_test \
|| exit

# This script has been tested on a 2021 MacBook Pro with an M1 Pro processor.

## test_linux.sh
#!/bin/sh

# This script builds and runs the DEC64 tests for an x64 machine running Linux.
# It takes no arguments.

# Assemble DEC64 into dec64.o.

nasm -f elf64 dec64.nasm || exit

build() {

# Compile, assemble and link a source file into a binary executable.

# We opt out of creating a position independent executable (a PIE), because we
# depend on absolute addressing. Without passing the the -no-pie flag, we get a
# warning like

#   /usr/bin/ld: dec64.o: warning: relocation in read-only section `.text'
#   /usr/bin/ld: warning: creating DT_TEXTREL in a PIE

# because of lines in dec64.nasm like

#   mov     r10, [r10+r9*8]

    src=$1
    bin=$2
    gcc \
        -no-pie \
        -o $bin \
        dec64_string.c \
        dec64_math.c \
        dec64.o \
        $src
}

# Build the tests.

build dec64_test.c dec64_test \
&& build dec64_string_test.c dec64_string_test \
&& build dec64_math_test.c dec64_math_test \
|| exit

# Run the tests.

./dec64_test \
&& ./dec64_string_test \
&& ./dec64_math_test \
|| exit

# This script has been tested on Debian GNU/Linux 11 (Bullseye), using NASM
# v2.15.05 and gcc v10.2.1.
	#!/bin/bash

	# This script builds and runs the DEC64 tests for an ARM64 Android device. It
	# takes no arguments.

	# DEC64's ARM source must be tweaked slightly to make it compatible with
	# the Android NDK's clang:

	# 0. Comments start with a double slash, not a semicolon.
	# 1. The global directive, like all directives, is preceeded by a period.
	# 2. The area directive is not supported. We do, however, align.
	# 3. Labels must be suffixed with a colon.
	# 4. An 8 byte constant is declared with .quad rather than dcq.
	# 5. When shifting an address offset, an extra comma is required.
	# 6. The end directive is unnecessary.

	sed \
	-E \
	-e "s_;_//_g" \
	-e "s/global ([a-z0-9_]+) \\[func\\]/.global \1/g" \
	-e "s/area dec64, align=8, code, readonly/.align 8/g" \
	-e "s/^([a-z0-9_]+)/\1:/g" \
	-e "s/dcq/.quad/g" \
	-e "s/x([0-9]) lsl 3/x\1, lsl 3/g" \
	-e "/ end/d" \
	<dec64.s \
	>dec64.android.s \
	\|\| exit

	build_and_run() {

	# Compile, assemble and link a source file into a binary executable, then
	# transfer and run that executable on a connected Android device.

	src=$1
	bin=$2

	# The tests shift negative integers on purpose, so we suppress that warning.

	"NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android32-clang" \
	-Wno-shift-negative-value \
	-o $bin \
	dec64_string.c \
	dec64_math.c \
	dec64.android.s \
	$src \
	&& adb push ./$bin /data/local/tmp/$bin \
	&& adb shell /data/local/tmp/$bin
	}

	build_and_run dec64_test.c dec64_test \
	&& build_and_run dec64_string_test.c dec64_string_test \
	&& build_and_run dec64_math_test.c dec64_math_test \
	\|\| exit

	# This script has been tested on a Samsung A52 5G device running Android 12,
	# using the Android NDK r25b.
	#!/bin/sh

	# This script builds and runs the DEC64 tests for an x64 machine running MacOS
	# (Darwin). It takes no arguments.

	# Assemble DEC64 into dec64.o. We prefix the function names with an underscore,
	# because that is what clang expects.

	nasm \
	-f macho64 \
	--prefix _ \
	dec64.nasm \
	\|\| exit

	build() {

	# Compile, assemble and link a source file into a binary executable.

	# We opt out of creating a position independent executable (a PIE), because we
	# depend on absolute addressing. Without passing the linker the -no_pie flag,
	# we get a warning like

	# PIE disabled. Absolute addressing (perhaps -mdynamic-no-pic) not allowed in
	# code signed PIE, but used in pack_increase from dec64.o.

	# because of lines in dec64.nasm like

	# mov r10, [r10+r9*8]

	# The tests shift negative integers on purpose, so we suppress that warning.

	src=$1
	bin=$2
	clang \
	-Wl,-no_pie \
	-Wno-shift-negative-value \
	-o $bin \
	dec64_string.c \
	dec64_math.c \
	dec64.o \
	$src
	}

	# Build the tests.

	build dec64_test.c dec64_test \
	&& build dec64_string_test.c dec64_string_test \
	&& build dec64_math_test.c dec64_math_test \
	\|\| exit

	# Run the tests.

	./dec64_test \
	&& ./dec64_string_test \
	&& ./dec64_math_test \
	\|\| exit

	# This script has been tested on MacOS 11.4 (Big Sur), using NASM
	# v2.15.05 and clang 13.0.0.
	#!/bin/bash

	# This script builds and runs the DEC64 tests for a Macintosh with an Apple
	# Silicon processor. It takes no arguments.

	# DEC64's ARM source must be tweaked slightly to make it compatible with clang:

	# 0. Comments start with a double slash, not a semicolon.
	# 1. The global directive, like all directives, is preceeded by a period.
	# 2. The area directive is not supported. We do, however, align.
	# 3. Labels must be suffixed with a colon.
	# 4. An 8 byte constant is declared with .quad rather than dcq.
	# 5. When shifting an address offset, an extra comma is required.
	# 6. The end directive is unnecessary.
	# 7. The names of entrypoints require a leading underscore to link correctly.

	sed \
	-E \
	-e "s_;_//_g" \
	-e "s/global ([a-z0-9_]+) \\[func\\]/.global \1/g" \
	-e "s/area dec64, align=8, code, readonly/.align 8/g" \
	-e "s/^([a-z0-9_]+)/\1:/g" \
	-e "s/dcq/.quad/g" \
	-e "s/x([0-9]) lsl 3/x\1, lsl 3/g" \
	-e "/ end/d" \
	-e "s/dec64_/_dec64_/g" \
	<dec64.s \
	>dec64.silicon.s \
	\|\| exit

	build_and_run() {

	# Compile, assemble and link a source file into a binary executable, then
	# run it.

	src=$1
	bin=$2

	# The tests shift negative integers on purpose, so we suppress that warning.

	clang \
	-Wno-shift-negative-value \
	-o $bin \
	dec64.silicon.s \
	dec64_string.c \
	dec64_math.c \
	$src \
	&& ./$bin
	}

	build_and_run dec64_test.c dec64_test \
	&& build_and_run dec64_string_test.c dec64_string_test \
	&& build_and_run dec64_math_test.c dec64_math_test \
	\|\| exit

	# This script has been tested on a 2021 MacBook Pro with an M1 Pro processor.
	#!/bin/sh

	# This script builds and runs the DEC64 tests for an x64 machine running Linux.
	# It takes no arguments.

	# Assemble DEC64 into dec64.o.

	nasm -f elf64 dec64.nasm \|\| exit

	build() {

	# Compile, assemble and link a source file into a binary executable.

	# We opt out of creating a position independent executable (a PIE), because we
	# depend on absolute addressing. Without passing the the -no-pie flag, we get a
	# warning like

	# /usr/bin/ld: dec64.o: warning: relocation in read-only section `.text'
	# /usr/bin/ld: warning: creating DT_TEXTREL in a PIE

	# because of lines in dec64.nasm like

	# mov r10, [r10+r9*8]

	src=$1
	bin=$2
	gcc \
	-no-pie \
	-o $bin \
	dec64_string.c \
	dec64_math.c \
	dec64.o \
	$src
	}

	# Build the tests.

	build dec64_test.c dec64_test \
	&& build dec64_string_test.c dec64_string_test \
	&& build dec64_math_test.c dec64_math_test \
	\|\| exit

	# Run the tests.

	./dec64_test \
	&& ./dec64_string_test \
	&& ./dec64_math_test \
	\|\| exit

	# This script has been tested on Debian GNU/Linux 11 (Bullseye), using NASM
	# v2.15.05 and gcc v10.2.1.