Last active
August 29, 2015 14:11
-
-
Save pramsey/9497206ee0e282fdd651 to your computer and use it in GitHub Desktop.
Brittle perl for parsing SCOTUS coordinates into WKT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use Switch; | |
# Download the SCOTUS decision from here | |
# http://www.supremecourt.gov/opinions/14pdf/no5orig_d18e.pdf | |
# Convert it from PDF to a raw text form using this service | |
# http://convertonlinefree.com/PDFToTXTEN.aspx | |
# Then run this script on it | |
my $first = 1; | |
my @curve = (); | |
my @cmd = (); | |
my @x = (); | |
my @y = (); | |
my $count = 1; | |
while(<>) | |
{ | |
# Remove Carriage-returns | |
s/\r//g; | |
# Skip white-space-only lines | |
next if( ! /\S/ ); | |
# Strip duplicative white space | |
s/^\s*//g; | |
s/\t/ /g; | |
s/ +/ /g; | |
# Tokenize the command strings */ | |
s/BEGINNING AT/BEGINNING_AT/g; | |
s/BY ARC CENTERED AT/BY_ARC_CENTERED_AT/g; | |
s/BY STRAIGHT LINE TO/BY_STRAIGHT_LINE_TO/g; | |
my @commentary = (); | |
if ( @cmd && @x && @y ) | |
{ | |
@cmd = (); | |
@x = (); | |
@y = (); | |
} | |
# Key trick: Once we've changed the command tokens into single words | |
# we can split the text up on spaces to get nice tokens ready for | |
# further processing. | |
my @line = split(/ /); | |
# Convert each line of the raw text into three lists: one of command | |
# tokens, one of x values and one of y values. | |
# Fortunately UTM x/y values are so different, we can use simple magnitude to | |
# distinguish them (y's are > 1 million) | |
foreach my $l (@line) | |
{ | |
switch ( $l ) | |
{ | |
case "BEGINNING_AT" { push(@cmd, $l); } | |
case "BY_ARC_CENTERED_AT" { push(@cmd, $l); } | |
case "BY_STRAIGHT_LINE_TO" { push(@cmd, $l); } | |
case "TO" { push(@cmd, $l); } | |
else { | |
if ( $l =~ /^\d+\.\d+/ ) | |
{ | |
if ( $l > 1000000 ) | |
{ | |
push(@y, $l); | |
} | |
else | |
{ | |
push(@x, $l); | |
} | |
} | |
else | |
{ | |
push(@commentary, $l); | |
} | |
} | |
} | |
} | |
# printf "RAW %s\n", $_; | |
# whenever we have all three buffers full and of the same size | |
# that means we're ready to empty them out again and process | |
# the next raw line of stuff | |
if ( @cmd && @x && @y ) | |
{ | |
# printf "COMMANDS: %s\n", join(" ", @cmd); | |
# printf " X: %s\n", join(" ", @x); | |
# printf " Y: %s\n", join(" ", @y); | |
my $csz = @cmd; | |
my $xsz = @x; | |
my $ysz = @y; | |
# if ( ! (($csz == $xsz) && ($csz == $ysz)) ) | |
# { | |
# printf " ERROR: commands=%d x=%d y=%d\n", $csz, $xsz, $ysz; | |
# printf " RAW: %s\n", $_; | |
# } | |
# | |
# Handle the inevitable special case, for one line string they ended an | |
# arc, not on a coordinate but on "the intersection with the | |
# California-Oregon state lateral boundary" | |
# In that case we throw away the terminal arc, sorry. Real | |
# programmers can go and research what that coordinate actually is. | |
my $nrecords = @x; | |
$nrecords-- if ( @cmd > @x ) ; | |
# Convert three arrays of tokens/coordinates into a single array with | |
# one entry for each command/x/y combination. | |
for ( my $i = 0; $i < $nrecords; $i++ ) | |
{ | |
if ( @cmd[$i] eq "BEGINNING_AT" && ! $first ) | |
{ | |
&output(\@curve); | |
@curve = (); | |
printf "\n\n"; | |
} | |
else | |
{ | |
$first = 0; | |
} | |
push(@curve, { cmd => $cmd[$i], x => $x[$i], y => $y[$i] } ); | |
} | |
} | |
} | |
&output(\@curve); | |
exit; | |
###################################################################### | |
# Taking in an array of (cmd/y/x) records, output a COMPOUNDCURVE | |
# for that array. | |
sub output | |
{ | |
$curve = shift(); | |
# foreach $cmd (@$curve) | |
# { | |
# printf "%s %s %s\n", $cmd->{cmd}, $cmd->{x}, $cmd->{y}; | |
# } | |
my $i = 1; | |
@sub = (); | |
my $j = 0; | |
# Really ugly magic to deal with the coordinate systems. The first object | |
# in the scotus decision is UTM 11, the next three UTM 10, the final ones | |
# back to UTM 11 | |
my $srid=26911; | |
$srid = 26910 if ( $count >= 2 && $count <= 4 ); | |
$count++; | |
printf "INSERT INTO causa (geom) VALUES ('SRID=%d;COMPOUNDCURVE(\n", $srid; | |
# The WKT for a compound curve has duplicated vertices in places | |
# where the CIRCULARSTRINGS join up to the LINESTRINGS, so we have | |
# to turn our mixed list of commands into homogenous collections of | |
# just lines and just arcs. | |
while ( $i < @$curve ) | |
{ | |
my $c0 = $curve->[$i-1]; | |
my $c1 = $curve->[$i]; | |
my $c2 = $curve->[$i+1]; | |
if ( $c1->{cmd} eq "BY_ARC_CENTERED_AT" ) | |
{ | |
if ( $c0->{cmd} ne "TO" ) | |
{ | |
print ",\n" if $j++ > 1; | |
&output_sub(\@sub); | |
@sub = (); | |
push(@sub, $c0); | |
push(@sub, $c1); | |
push(@sub, $c2); | |
} | |
else | |
{ | |
push(@sub, $c1); | |
push(@sub, $c2); | |
} | |
$i += 2; | |
next; | |
} | |
if ( $c1->{cmd} eq "BY_STRAIGHT_LINE_TO" ) | |
{ | |
if ( $c0->{cmd} ne "BY_STRAIGHT_LINE_TO" ) | |
{ | |
print ",\n" if $j++ > 1; | |
&output_sub(\@sub); | |
@sub = (); | |
push(@sub, $c0); | |
push(@sub, $c1); | |
} | |
else | |
{ | |
push(@sub, $c1); | |
} | |
$i += 1; | |
next; | |
} | |
} | |
print ",\n" if $j++ > 1; | |
&output_sub(\@sub); | |
print ")');\n"; | |
return; | |
} | |
# Taking in an homogenious (all arcs or all line) array of cmd/x/y | |
# output the appropriate WKT object | |
sub output_sub | |
{ | |
my $sub = shift(); | |
my @a = @$sub; | |
my $type = ""; | |
my $i = 0; | |
return if ! @a; | |
if ( $a[1]->{cmd} eq "BY_ARC_CENTERED_AT" ) | |
{ | |
$type = "CIRCULARSTRING"; | |
} | |
printf "%s(", $type; | |
for ( my $i = 0; $i < @a; $i++ ) | |
{ | |
print "," if $i; | |
# Convert center point to mid-arc point | |
if ( $type eq "CIRCULARSTRING" && $i % 2 ) | |
{ | |
my $x0 = @a[$i-1]->{x}; | |
my $y0 = @a[$i-1]->{y}; | |
my $x1 = @a[$i]->{x}; | |
my $y1 = @a[$i]->{y}; | |
my $x2 = @a[$i+1]->{x}; | |
my $y2 = @a[$i+1]->{y}; | |
$x0 -= $x1; | |
$y0 -= $y1; | |
$x2 -= $x1; | |
$y2 -= $y1; | |
my $x1n = ($x0 + $x2)/2; | |
my $y1n = ($y0 + $y2)/2; | |
my $r1 = sqrt($x1n**2 + $y1n**2); | |
$x1n /= $r1; | |
$y1n /= $r1; | |
my $r = sqrt($x0**2 + $y0**2); | |
$x1n *= $r; | |
$y1n *= $r; | |
$x1n += $x1; | |
$y1n += $y1; | |
@a[$i]->{x} = $x1n; | |
@a[$i]->{y} = $y1n; | |
} | |
printf "%s %s", @a[$i]->{x}, @a[$i]->{y}; | |
} | |
printf ")"; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment