Created
July 6, 2020 16:01
-
-
Save vyuh/75e27ed631da531b7570462ac90f2e8b to your computer and use it in GitHub Desktop.
QUESTION ON STATS.PL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
NUMBERS as JSON formatted strings may be very big -> | |
JSON::PP allow_bignum decode -> | |
List::Util sum0. &stats -> | |
JSON::PP allow_bignum encode -> | |
NUMBERS ENCODED AS STRINGS. WRONG CALCULATED VALUES DUE TO OVERFLOWS. | |
I can tolerate imprecise calculation of mean and variance, | |
but not entirely inaccurate result or numerical result encoded as string.:w | |
I am using a function to calculate statistics for a list of numbers. | |
These numbers may be big enough to lose precision if stored as normal perl numbers. | |
I recieve such numbers as JSON formatted strings. | |
To decode these strings without losing precision, | |
I use a `JSON::PP` object with `allow_nonref` and `allow_bignum` activated. | |
I send the list of such decoded numbers to `stats` subroutine | |
(see in code shown below). | |
This rourine caluculates some statistics. | |
These statistics are then encoded to JSON and saved to file. | |
Most of the time the process seems to work correctly, but | |
for some inputs (see code for example) | |
there the calculated value of mean and variance stattistics | |
are either clearly wrong, or are encoded as JSON strings by the encoder or both. | |
I suspect this is due to interaction of `Math::BigInt`, `Math::BigFloat`, | |
and `List::Util::sum0`. | |
I am trying to figure out what causes this and a way to avoid/fix this. | |
I am willing to accept imprecise calculation of mean and variance, | |
but not entirely inaccurate results | |
or numerical results encoded as string in JSON. | |
A sript to demonstrate the problem: | |
use strict; | |
use warnings; | |
use Data::Dumper; | |
$Data::Dumper::Varname = "DUMPED_RAWDATA"; | |
use JSON::PP; | |
use List::Util; | |
my $JSON = JSON::PP->new->allow_bignum->utf8->pretty->canonical; | |
sub stats { | |
#TODO fix bug about negative variance. AVOID OVERFLOW | |
#TODO use GMP, XS? | |
my $n = scalar @_; | |
my $sum = List::Util::sum0(@_); | |
my $mean = $sum / $n; | |
my $var = List::Util::sum0( map { $_**2 } @_ ) / $n - $mean**2; | |
my $s = { | |
n => $n, | |
sum => $sum, | |
max => List::Util::max(@_), | |
min => List::Util::min(@_), | |
mean => $mean, | |
variance => $var | |
}; | |
# DUMP STATE IF SOME ERROR OCCURS | |
print Dumper( \@_ ), | |
$JSON->encode( { json_encoded_stats => $s, json_encoded_rawdata => \@_ } ) | |
if ( '"' eq substr( $JSON->encode($var), 0, 1 ) #MEAN ENCODED AS STRING | |
or '"' eq substr( $JSON->encode($mean), 0, 1 ) #VARIANCE ENCODED AS STRING | |
or $var < 0 ); #VARIANCE IS NEGATIVE! | |
$s; | |
} | |
my @test = ( | |
[ | |
qw( 919300112739897344 919305709216464896 919305709216464896 985592115567603712 959299136196456448) | |
], | |
[qw(479655558 429035600 3281034608 3281034608 2606592908 3490045576)], | |
[ qw(914426431563644928) x 3142 ] | |
); | |
for (@test) { | |
print "---\n"; | |
stats( map { $JSON->decode($_) } @$_ ); | |
} | |
Below is the curtailed output of `perl stats.pl`. | |
--- | |
$DUMPED_RAWDATA1 = [ | |
'919300112739897344', | |
'919305709216464896', | |
'919305709216464896', | |
'985592115567603712', | |
'959299136196456448' | |
]; | |
{ | |
"json_encoded_rawdata" : [ | |
919300112739897344, | |
919305709216464896, | |
919305709216464896, | |
985592115567603712, | |
959299136196456448 | |
], | |
"json_encoded_stats" : { | |
"max" : 985592115567603712, | |
"mean" : "9.40560556587377e+17", | |
"min" : 919300112739897344, | |
"n" : 5, | |
"sum" : 4702802782936887296, | |
"variance" : 7.46903843214008e+32 | |
} | |
} | |
--- | |
$DUMPED_RAWDATA1 = [ | |
479655558, | |
429035600, | |
3281034608, | |
3281034608, | |
2606592908, | |
3490045576 | |
]; | |
{ | |
"json_encoded_rawdata" : [ | |
479655558, | |
429035600, | |
3281034608, | |
3281034608, | |
2606592908, | |
3490045576 | |
], | |
"json_encoded_stats" : { | |
"max" : 3490045576, | |
"mean" : 2261233143, | |
"min" : 429035600, | |
"n" : 6, | |
"sum" : 13567398858, | |
"variance" : "-1.36775568782523e+18" | |
} | |
} | |
--- | |
$DUMPED_RAWDATA1 = [ | |
'914426431563644928', | |
. | |
. | |
. | |
<snip 3140 identical lines> | |
'914426431563644928' | |
]; | |
{ | |
"json_encoded_rawdata" : [ | |
914426431563644928, | |
. | |
. | |
. | |
<snip 3140 identical lines> | |
914426431563644928 | |
], | |
"json_encoded_stats" : { | |
"max" : 914426431563644928, | |
"mean" : "9.14426431563676e+17", | |
"min" : 914426431563644928, | |
"n" : 3142, | |
"sum" : 2.87312784797307e+21, | |
"variance" : -9.75463826617761e+22 | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment