Skip to content

Instantly share code, notes, and snippets.

@zcorpan
Last active January 14, 2016 13:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zcorpan/baa697e081a3e1aa5da0 to your computer and use it in GitHub Desktop.
Save zcorpan/baa697e081a3e1aa5da0 to your computer and use it in GitHub Desktop.
<!doctype html>
<meta charset=utf-8>
<title>coords</title>
<style>
table { table-layout:fixed; width:100%; border-collapse:collapse }
td { max-width:25%; overflow:hidden; border:2px solid gray; padding:0.5em; font-family:monospace }
</style>
<table>
<tr><th>test<th>old parser<th>new parser (POC)<th>new parser (new-spec-compliant)
<script>
/**
 * Parses a string into a list of integers, following the step-by-step
 * "rules for parsing a list of integers" quoted in the comments below
 * (the "old parser" column of the comparison table).
 *
 * Entries are separated by U+0020 SPACE, U+002C COMMA or U+003B SEMICOLON.
 * Digits after the first non-digit inside an entry are ignored (so "13.5"
 * yields 13). Parsing stops early, returning what has been collected so far,
 * when an entry contains no digits or when a letter/non-ASCII character
 * follows a started number.
 *
 * @param {string} input - The string being parsed.
 * @returns {number[]} The integers parsed from input, in order.
 */
function parseListOfInts(input) {
  const spaceCommaSemicolon = /^[ ,;]$/;
  const asciiDigit = /^\d$/;
  // Non-alphanumeric ASCII other than the separators. The final range is
  // U+007B..U+007F so that "{", "|", "}" and "~" count as punctuation too;
  // the previous class only listed U+007F and sent those four characters to
  // the "any other character" branch.
  const asciiPunctuation = /^[\u0001-\u001f\u0021-\u002b\u002d-\u002f\u003a\u003c-\u0040\u005b-\u0060\u007b-\u007f]$/;
  // Let position be a pointer into input, initially pointing at the start of the string.
  let pos = 0;
  // Let numbers be an initially empty list of integers. This list will be the result of this
  // algorithm.
  const numbers = [];
  start: while (true) {
    // Step 4: if the character at position is a separator, advance position by one.
    if (spaceCommaSemicolon.test(input[pos])) {
      pos++;
    }
    // If position points to beyond the end of input, return numbers and abort.
    if (input[pos] === undefined) {
      return numbers;
    }
    // If the character at position is (still) a separator, return to step 4.
    if (spaceCommaSemicolon.test(input[pos])) {
      continue start;
    }
    // Per-entry state.
    // negated: the entry started with a U+002D HYPHEN-MINUS.
    let negated = false;
    // value: the integer accumulated so far.
    let value = 0;
    // started: set once the parser sees a digit or a hyphen-minus.
    let started = false;
    // gotNumber: set once the parser sees a digit.
    let gotNumber = false;
    // finished: set to switch the parser into a mode where it ignores
    // characters until the next separator.
    let finished = false;
    // bogus: set once an "any other character" has been seen in this entry.
    let bogus = false;
    // Parser: dispatch on the character at position.
    parser: while (true) {
      // "Skip to the next step in the overall set of steps" is implemented as
      // `break inner_parser`, which falls through to the position advance below.
      inner_parser: {
        const c = input[pos];
        if (c === '-') {
          // A U+002D HYPHEN-MINUS character.
          if (gotNumber) {
            finished = true;
          }
          if (finished) {
            break inner_parser;
          }
          if (started) {
            negated = false;
          } else if (!bogus) {
            negated = true;
          }
          started = true;
        } else if (asciiDigit.test(c)) {
          // An ASCII digit.
          if (finished) {
            break inner_parser;
          }
          value *= 10;
          value += parseInt(c, 10);
          started = true;
          gotNumber = true;
        } else if (spaceCommaSemicolon.test(c)) {
          // A separator ends the entry. An entry with no digits, as in
          // "1,2,x,4", aborts the whole parse.
          if (!gotNumber) {
            return numbers;
          }
          if (negated) {
            value *= -1;
          }
          numbers.push(value);
          // Jump to step 4; it advances past this separator.
          continue start;
        } else if (asciiPunctuation.test(c)) {
          // Other non-alphanumeric ASCII.
          if (gotNumber) {
            finished = true;
          }
          if (finished) {
            break inner_parser;
          }
          negated = false;
        } else {
          // Any other character (letters, non-ASCII).
          if (finished) {
            // Skip to the position advance like every other branch. Leaving
            // the parser loop here would drop the collected value and restart
            // the entry in the middle of its trailing garbage.
            break inner_parser;
          }
          negated = false;
          bogus = true;
          if (started) {
            // The value is not appended to the list first; it is dropped.
            return numbers;
          }
        }
      }
      // Advance position to the next character in input, or to beyond the end of the string if
      // there are no more characters.
      pos++;
      // If position points to a character, jump to the big Parser step above.
      if (input[pos] !== undefined) {
        continue parser;
      }
      // End of input: flush the pending entry.
      if (negated) {
        value *= -1;
      }
      if (gotNumber) {
        numbers.push(value);
      }
      return numbers;
    }
  }
}
/**
 * Proof-of-concept coords parser: splits on whitespace/commas and parses each
 * piece as a float.
 *
 * Leading separators are dropped before splitting. Within a piece, every
 * character that is not a digit, "." or "-" is blanked out with a space before
 * parsing. Pieces that do not parse (including the empty piece produced by a
 * trailing separator or an empty input) contribute 0.
 *
 * @param {string} input - The string being parsed.
 * @returns {number[]} One number per piece.
 */
function newCoords(input) {
  const withoutLeadingSeparators = input.replace(/^[\s,]+/, '');
  const pieces = withoutLeadingSeparators.split(/[\s,]+/);
  return pieces.map(function (piece) {
    // Garbage becomes spaces, so parseFloat stops at the first blanked
    // character after the numeric prefix.
    const cleaned = piece.replace(/[^\d\.-]/g, ' ');
    // Unparseable pieces yield NaN, which falls back to 0.
    return parseFloat(cleaned) || 0;
  });
}
/**
 * Collects the run of consecutive characters, starting at pos, that each
 * match regex.
 *
 * @param {string} input - The string being scanned.
 * @param {number} pos - Index at which to start collecting.
 * @param {RegExp} regex - Tested against one character at a time; it should
 *     not carry the g or y flag, since test() is stateful with those flags.
 * @returns {[string, number]} The collected characters and the index of the
 *     first character that was not collected (input.length if the run
 *     reaches the end of input).
 */
function collectCharacters(input, pos, regex) {
  const startPos = pos;
  // Check the bound before testing: input[pos] past the end is undefined,
  // which test() would coerce to the string "undefined" and an unanchored
  // pattern could then match.
  while (pos < input.length && regex.test(input[pos])) {
    pos++;
  }
  // slice takes an end index; substr's second argument is a length, so
  // passing pos to it returned characters beyond the collected run.
  return [input.slice(startPos, pos), pos];
}
/**
 * Parses a string into a list of floating-point numbers, following the
 * step-by-step algorithm quoted in the comments below (the
 * "new-spec-compliant" column of the comparison table).
 *
 * Entries are separated by whitespace, U+002C COMMA or U+003B SEMICOLON.
 * Leading garbage in an entry is skipped, and an entry that does not parse to
 * a finite number contributes 0.
 *
 * @param {string} input - The string being parsed.
 * @returns {number[]} One number per entry, in order.
 */
function newSpecCoords(input) {
  // Each pattern is tested against a single character, so all are anchored
  // at both ends.
  const delimiter = /^[\s,;]$/;
  const leadingGarbage = /^[^\s,;\d\.-]$/;
  const entryCharacter = /^[^\s,;]$/;
  // Let position be a pointer into input, initially pointing at the start of the
  // string.
  let pos = 0;
  // Let numbers be an initially empty list of floating-point numbers. This list
  // will be the result of this algorithm.
  const numbers = [];
  let unparsedNumber;
  // Collect a sequence of delimiters. This skips past any leading delimiters.
  [, pos] = collectCharacters(input, pos, delimiter);
  // While position is not past the end of input:
  while (input[pos] !== undefined) {
    // Collect a sequence of characters that are not delimiters, ASCII digits,
    // U+002E FULL STOP, or U+002D HYPHEN-MINUS. This skips past leading garbage.
    [, pos] = collectCharacters(input, pos, leadingGarbage);
    // Collect a sequence of characters that are not delimiters, and let
    // unparsed number be the result.
    [unparsedNumber, pos] = collectCharacters(input, pos, entryCharacter);
    // Let number be the result of parsing unparsed number as a floating-point
    // number.
    const number = parseFloat(unparsedNumber);
    // If number is an error, let number be zero. parseFloat signals failure
    // with NaN, but it also returns +/-Infinity for "-Infinity" and for
    // literals too large for a double; neither is a valid number here, so
    // every non-finite result becomes zero.
    numbers.push(Number.isFinite(number) ? number : 0);
    // Collect a sequence of delimiters. This skips past the delimiter.
    [, pos] = collectCharacters(input, pos, delimiter);
  }
  // Return numbers.
  return numbers;
}
// Sample coords values fed to all three parsers.
var tests = [
// a few from webdevdata
"142,130,140,139,152,139,149,127,",
"138,10,13.5",
"594,72,779,72,779,142,594,142,5shop.com.tw/return/ef_return.html",
"“0,12,625,478\"",
"137,6 151,1,163,4,235,76,206,76",
"557,328,705,329,706,517,658,518,656ls/spain/holidays/regions/3/Canary+Islands/Canary+Islands.html",
"59,46,64,45,65,46,65,48,67,49,69,50,71,52,70,52,69,56,67,58,67,60,61,60,60,65,58,67,59,69,57,70,50,69,48,71,43,69,46,72,,215,6,218,3,220,2,223,1,228,2,234,4,238,6,240,6,244,10,246,14,247,18,250,23,254,27,257,32,259,34,255,33,251,35,250,38,249,38,246,40,245,40,243,37,241,37,239,38,236,37,233,36,231,38,228,39,226,39,220,37,218,35,216,35,214,37,212,38,210,38,206,35,204,31,201,28,199,25,195,23,192,19,188,13,187,10,188,6,192",
"='69,8,153,86' ", // babyneo.de
// https://lists.w3.org/Archives/Public/public-html/2009Jan/0086.html
",1,2,3,4"
];
// Write one table row per sample: the raw input followed by the output of
// each parser, in the same order as the header row.
tests.forEach((sample) => {
  const cells = [
    sample,
    parseListOfInts(sample),
    newCoords(sample),
    newSpecCoords(sample)
  ];
  const row = '<tr>' + cells.map((cell) => '<td>' + esc(cell)).join('');
  document.writeln(row);
});
/**
 * Escapes a value for insertion into HTML markup.
 *
 * @param {*} s - Value to escape; converted with String() first, so an array
 *     is rendered as its comma-joined elements.
 * @returns {string} The text with &, <, > and " replaced by character
 *     references.
 */
function esc(s) {
  return String(s)
    // Ampersand goes first; escaping it after the others would re-escape the
    // "&" of the references just produced (turning " into &amp;quot;).
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}
</script>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment