kubo39/phobos-float-parrsing.md

## phobos-float-parrsing.md

      
    Raw
  

              phobos-float-parrsing.md
            
          
    D言語のfloat-parsing algorithm ("somestr".parse!double) について

Phobosのfloat parsing algorithmをみていく。
単純化したもので考える。
nanとかinfとかはみればわかるので(単に文字比較しているだけ)そこは飛ばす。
それでも結構長いな。
/**
 * Parses a character range to a floating point number.
 *
 * Params:
 *     Target = a floating point type
 *     source = the lvalue of the range to _parse
 *
 * Returns:
 $(UL
 *     $(LI A floating point number of type `Target`)
 *
 * Throws:
 *     A $(LREF ConvException) if `source` is empty, if no number could be
 *     parsed, or if an overflow occurred.
 */
auto parse(Target, Source)(ref Source source)
if (isInputRange!Source && isSomeChar!(ElementType!Source) && !is(Source == enum) &&
    isFloatingPoint!Target && !is(Target == enum))
{
    alias p = source;

    static immutable real[14] negtab =
        [ 1e-4096L,1e-2048L,1e-1024L,1e-512L,1e-256L,1e-128L,1e-64L,1e-32L,
                1e-16L,1e-8L,1e-4L,1e-2L,1e-1L,1.0L ];
    static immutable real[13] postab =
        [ 1e+4096L,1e+2048L,1e+1024L,1e+512L,1e+256L,1e+128L,1e+64L,1e+32L,
                1e+16L,1e+8L,1e+4L,1e+2L,1e+1L ];

    ConvException bailOut()(string msg = null, string fn = __FILE__, size_t ln = __LINE__)
    {
        if (msg == null)
            msg = "Floating point conversion error";
        return new ConvException(text(msg, " for input \"", source, "\"."), fn, ln);
    }

    enforce(!p.empty, bailOut());

    bool sign = false;
    switch (p.front)
    {
    case '-':
        sign = true;
        p.popFront();
        enforce(!p.empty, bailOut());
        if (toLower(p.front) == 'i')
            goto case 'i';
        break;
    case '+':
        p.popFront();
        enforce(!p.empty, bailOut());
        break;
    default: {}
    }

    bool isHex = false;
    bool startsWithZero = p.front == '0';
    if (startsWithZero)
    {
        p.popFront();
        if (p.empty)
        {
            return sign ? -0.0 : 0.0;
        }

        isHex = p.front == 'x' || p.front == 'X';
        if (isHex)
        {
            p.popFront();
        }
    }
    else if (toLower(p.front) == 'n')
    {
        // nan
        p.popFront();
        enforce(!p.empty && toUpper(p.front) == 'A',
               bailOut("error converting input to floating point"));
        p.popFront();
        enforce(!p.empty && toUpper(p.front) == 'N',
               bailOut("error converting input to floating point"));
        // skip past the last 'n'
        p.popFront();
        return typeof(return).nan;
    }

    /*
     * The following algorithm consists of 2 steps:
     * 1) parseDigits processes the textual input into msdec and possibly
     *    lsdec/msscale variables, followed by the exponent parser which sets
     *    exp below.
     *    Hex: input is 0xaaaaa...p+000... where aaaa is the mantissa in hex
     *    and 000 is the exponent in decimal format with base 2.
     *    Decimal: input is 0.00333...p+000... where 0.0033 is the mantissa
     *    in decimal and 000 is the exponent in decimal format with base 10.
     * 2) Convert msdec/lsdec and exp into native real format
     */

    real ldval = 0.0;
    char dot = 0;                        /* if decimal point has been seen */
    int exp = 0;
    ulong msdec = 0, lsdec = 0;
    ulong msscale = 1;
    bool sawDigits;

    enum { hex, decimal }

    // sets msdec, lsdec/msscale, and sawDigits by parsing the mantissa digits
    void parseDigits(alias FloatFormat)()
    {
        static if (FloatFormat == hex)
        {
            enum uint base = 16;
            enum ulong msscaleMax = 0x1000_0000_0000_0000UL; // largest power of 16 a ulong holds
            enum ubyte expIter = 4; // iterate the base-2 exponent by 4 for every hex digit
            alias checkDigit = isHexDigit;
            /*
             * convert letter to binary representation: First clear bit
             * to convert lower space chars to upperspace, then -('A'-10)
             * converts letter A to 10, letter B to 11, ...
             */
            alias convertDigit = (int x) => isAlpha(x) ? ((x & ~0x20) - ('A' - 10)) : x - '0';
            sawDigits = false;
        }
        else static if (FloatFormat == decimal)
        {
            enum uint base = 10;
            enum ulong msscaleMax = 10_000_000_000_000_000_000UL; // largest power of 10 a ulong holds
            enum ubyte expIter = 1; // iterate the base-10 exponent once for every decimal digit
            alias checkDigit = isDigit;
            alias convertDigit = (int x) => x - '0';
            // Used to enforce that any mantissa digits are present
            sawDigits = startsWithZero;
        }
        else
            static assert(false, "Unrecognized floating-point format used.");

        while (!p.empty)
        {
            int i = p.front;
            while (checkDigit(i))
            {
                sawDigits = true;        /* must have at least 1 digit   */

                i = convertDigit(i);

                if (msdec < (ulong.max - base)/base)
                {
                    // For base 16: Y = ... + y3*16^3 + y2*16^2 + y1*16^1 + y0*16^0
                    msdec = msdec * base + i;
                }
                else if (msscale < msscaleMax)
                {
                    lsdec = lsdec * base + i;
                    msscale *= base;
                }
                else
                {
                    exp += expIter;
                }
                exp -= dot;
                p.popFront();
                if (p.empty)
                    break;
                i = p.front;
                if (i == '_')
                {
                    p.popFront();
                    if (p.empty)
                        break;
                    i = p.front;
                }
            }
            if (i == '.' && !dot)
            {
                p.popFront();
                dot += expIter;
            }
            else
                break;
        }

        // Have we seen any mantissa digits so far?
        enforce(sawDigits, bailOut("no digits seen"));
        static if (FloatFormat == hex)
            enforce(!p.empty && (p.front == 'p' || p.front == 'P'),
                    bailOut("Floating point parsing: exponent is required"));
    }

    if (isHex)
        parseDigits!hex;
    else
        parseDigits!decimal;

    if (isHex || (!p.empty && (p.front == 'e' || p.front == 'E')))
    {
        char sexp = 0;
        int e = 0;

        p.popFront();
        enforce(!p.empty, new ConvException("Unexpected end of input"));
        switch (p.front)
        {
            case '-':    sexp++;
                         goto case;
            case '+':    p.popFront();
                         break;
            default: {}
        }
        sawDigits = false;
        while (!p.empty && isDigit(p.front))
        {
            if (e < 0x7FFFFFFF / 10 - 10)   // prevent integer overflow
            {
                e = e * 10 + p.front - '0';
            }
            p.popFront();
            sawDigits = true;
        }
        exp += (sexp) ? -e : e;
        enforce(sawDigits, new ConvException("No digits seen."));
    }

    ldval = msdec;
    if (msscale != 1)               /* if stuff was accumulated in lsdec */
        ldval = ldval * msscale + lsdec;
    if (isHex)
    {
        import core.math : ldexp;

        // Exponent is power of 2, not power of 10
        ldval = ldexp(ldval,exp);
    }
    else if (ldval)
    {
        uint u = 0;
        int pow = 4096;

        while (exp > 0)
        {
            while (exp >= pow)
            {
                ldval *= postab[u];
                exp -= pow;
            }
            pow >>= 1;
            u++;
        }
        while (exp < 0)
        {
            while (exp <= -pow)
            {
                ldval *= negtab[u];
                enforce(ldval != 0, new ConvException("Range error"));
                exp += pow;
            }
            pow >>= 1;
            u++;
        }
    }

    // if overflow occurred
    enforce(ldval != real.infinity, new ConvException("Range error"));
    return cast (Target) (sign ? -ldval : ldval);
}
10進数と16進数で処理が変わるのでそこをわけて考えてみよう。
10進数

コアのアルゴリズム以外なるべく無視する。
また、1.2のような具体的な値で考えてみる。
    static immutable real[14] negtab =
        [ 1e-4096L,1e-2048L,1e-1024L,1e-512L,1e-256L,1e-128L,1e-64L,1e-32L,
                1e-16L,1e-8L,1e-4L,1e-2L,1e-1L,1.0L ];
    static immutable real[13] postab =
        [ 1e+4096L,1e+2048L,1e+1024L,1e+512L,1e+256L,1e+128L,1e+64L,1e+32L,
                1e+16L,1e+8L,1e+4L,1e+2L,1e+1L ];

    /*
     * The following algorithm consists of 2 steps:
     * 1) parseDigits processes the textual input into msdec and possibly
     *    lsdec/msscale variables, followed by the exponent parser which sets
     *    exp below.
     *    Decimal: input is 0.00333...p+000... where 0.0033 is the mantissa
     *    in decimal and 000 is the exponent in decimal format with base 10.
     * 2) Convert msdec/lsdec and exp into native real format
     */

    real ldval = 0.0;
    char dot = 0;                        /* if decimal point has been seen */
    int exp = 0;
    ulong msdec = 0, lsdec = 0;
    ulong msscale = 1;
    bool sawDigits;

    enum { decimal }

    // sets msdec, lsdec/msscale, and sawDigits by parsing the mantissa digits
    void parseDigits(alias FloatFormat)()
    {
        enum ulong msscaleMax = 10_000_000_000_000_000_000UL; // largest power of 10 a ulong holds
        enum ubyte expIter = 1; // iterate the base-10 exponent once for every decimal digit
        alias checkDigit = isDigit;
        // Used to enforce that any mantissa digits are present
        sawDigits = startsWithZero;

        // ここに "1.2" がくることを仮定してみる
        while (!p.empty)
        {
            int i = p.front;  // asciiの数値に
            while (checkDigit(i))
            {
                sawDigits = true;        /* must have at least 1 digit   */

                i = i - '0';  // ここで整数になおす

                if (msdec < (ulong.max - 10) / 10)
                {
                    msdec = msdec * 10 + i;
                    // "1" がくるとき: msdec = 0 * 10 + 1 = 1
                    // "2" がくるとき: msdec = 1 * 10 + 2 = 12
                }
                else if (msscale < msscaleMax)
                {
                    lsdec = lsdec * 10 + i;
                    msscale *= 10;
                }
                else
                {
                    exp += expIter;
                }
                exp -= dot;  // "2" がくるとき: exp -= 1 = -1
                p.popFront();
                if (p.empty)
                    break;
                i = p.front;
            }
            if (i == '.' && !dot)  // "." がくるとき
            {
                p.popFront();
                dot += expIter; // dot = 1
            }
            else
                break;
        }
    }

    parseDigits!decimal;
    // msdec = 12
    // dot = 1
    // exp = -1

    // eかEが出現したとき。今回は無視する。
    if (!p.empty && (p.front == 'e' || p.front == 'E'))
    {
        char sexp = 0; // s-prefixはsigned(neg)の意味
        int e = 0;

        p.popFront();
        switch (p.front)
        {
            case '-':    sexp++;
                         goto case;
            case '+':    p.popFront();
                         break;
            default: {}
        }
        sawDigits = false;
        while (!p.empty && isDigit(p.front))
        {
            if (e < 0x7FFFFFFF / 10 - 10)   // prevent integer overflow
            {
                e = e * 10 + p.front - '0';
            }
            p.popFront();
            sawDigits = true;
        }
        exp += (sexp) ? -e : e;
    }

    ldval = msdec;  // ldval = 12
    if (msscale != 1)               /* if stuff was accumulated in lsdec */
        ldval = ldval * msscale + lsdec;
    uint u = 0;
    int pow = 4096;

    while (exp > 0)
    {
        while (exp >= pow)
        {
            ldval *= postab[u];
            exp -= pow;
        }
        pow >>= 1;
        u++;
    }

    // ldval = 12
    // exp = -1
    while (exp < 0)
    {
        // 一回目のループだと -1 <= -4096 でスキップ。
        // pow >>= 1するので -1 < - (pow=1) となるまでこのパスには入らない。
        // そのあいだu++するのでnegtabのインデックスが更新されていく。
        while (exp <= -pow)
        {
            ldval *= negtab[u];  // ループ外でなんやかんやあって ldval *= 1e-1L -> 12 *= 1e-1L -> 1.2 となる。
            exp += pow;  // -1 += 1 -> 0 なのでここで二重ループを抜ける。
        }
        pow >>= 1; // 4096 -> 2048
        u++; // 0 -> 1 -> 2
    }

    // if overflow occurred
    enforce(ldval != real.infinity, new ConvException("Range error"));
    return cast (Target) (sign ? -ldval : ldval);  // 最終的に1.2が返ってくるようになった。
}
というわけで元のコメントにあるように

文字列で与えられた数値のパース箇所
パースした情報を元に実際の浮動小数点数を求める箇所

の２つの主なセクションで考えることができる。
パース部分は単純に前から一文字ずつみていって数値に変換したりドットが出現したらその情報を持ったりする。
浮動小数点数を求めるところは整数表現に対してドットの場所分ずらした値に対応する指数表とかけて求めている。
"1.2"なら 12 * (10 ** -1) = 1.2 のような求め方。
Eisel-Lemire Algorithmでも小さい値の場合はこの考え方で求めている。
16進数

基本的なアルゴリズムはほとんど一緒で16進数なのでシフトする部分とかldexp使う部分が違うくらい。
    // 16進数は絶対にstartsWithZero == true
    bool startsWithZero = true; /* p.front == '0'; */
    p.popFront();
    bool isHex = true; /* p.front == 'x' || p.front == 'X'; */
    p.popFront();

    /*
     * The following algorithm consists of 2 steps:
     * 1) parseDigits processes the textual input into msdec and possibly
     *    lsdec/msscale variables, followed by the exponent parser which sets
     *    exp below.
     *    Hex: input is 0xaaaaa...p+000... where aaaa is the mantissa in hex
     *    and 000 is the exponent in decimal format with base 2.
     *    Decimal: input is 0.00333...p+000... where 0.0033 is the mantissa
     *    in decimal and 000 is the exponent in decimal format with base 10.
     * 2) Convert msdec/lsdec and exp into native real format
     */

    real ldval = 0.0;
    char dot = 0;                        /* if decimal point has been seen */
    int exp = 0;
    ulong msdec = 0, lsdec = 0;
    ulong msscale = 1;
    bool sawDigits;

    enum { hex }

    // sets msdec, lsdec/msscale, and sawDigits by parsing the mantissa digits
    void parseDigits(alias FloatFormat)()
    {
        enum ulong msscaleMax = 0x1000_0000_0000_0000UL; // largest power of 16 a ulong holds
        enum ubyte expIter = 4; // iterate the base-2 exponent by 4 for every hex digit
        alias checkDigit = isHexDigit;
        /*
         * convert letter to binary representation: First clear bit
         * to convert lower space chars to upperspace, then -('A'-10)
         * converts letter A to 10, letter B to 11, ...
         */
        alias convertDigit = (int x) => isAlpha(x) ? ((x & ~0x20) - ('A' - 10)) : x - '0';
        sawDigits = false;

        while (!p.empty)
        {
            int i = p.front;
            while (checkDigit(i))
            {
                sawDigits = true;        /* must have at least 1 digit   */

                i = convertDigit(i);

                if (msdec < (ulong.max - 16) / 16)
                {
                    // For base 16: Y = ... + y3*16^3 + y2*16^2 + y1*16^1 + y0*16^0
                    msdec = msdec * 16 + i;
                }
                else if (msscale < msscaleMax)
                {
                    lsdec = lsdec * 16 + i;
                    msscale *= 16;
                }
                else
                {
                    exp += expIter;
                }
                exp -= dot;
                p.popFront();
                if (p.empty)
                    break;
                i = p.front;
            }
            if (i == '.' && !dot)
            {
                p.popFront();
                dot += expIter;
            }
            else
                break;
        }
    }

    parseDigits!hex;

    char sexp = 0;
    int e = 0;

    p.popFront();
    switch (p.front)
    {
        case '-':    sexp++;
                     goto case;
        case '+':    p.popFront();
                     break;
        default: {}
    }
    sawDigits = false;
    while (!p.empty && isDigit(p.front))
    {
        if (e < 0x7FFFFFFF / 10 - 10)   // prevent integer overflow
        {
            e = e * 10 + p.front - '0';
        }
        p.popFront();
        sawDigits = true;
    }
    exp += (sexp) ? -e : e;

    ldval = msdec;
    if (msscale != 1)               /* if stuff was accumulated in lsdec */
        ldval = ldval * msscale + lsdec;

    // ldvalに2のexp乗をかける
    import core.math : ldexp;
    // Exponent is power of 2, not power of 10
    ldval = ldexp(ldval, exp);

    return cast (Target) (sign ? -ldval : ldval);
}