Last active
April 3, 2017 01:18
-
-
Save ImagoTrigger/46f108b445f6005711d56cea75ab576f to your computer and use it in GitHub Desktop.
RouteStar data scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use strict;
use warnings;

use Data::Dumper;
use File::Copy;
use Storable;
use Time::HiRes ();

use Geo::Coder::Bing;
use Geo::Coder::Google;
use Geo::Coder::MapQuest;
use LWP::UserAgent::Determined;
use Try::Tiny;
# ---------------------------------------------------------------------------
# Setup: HTTP client, geocoders, command-line arguments and persistent cache.
# ---------------------------------------------------------------------------

# HTTP client used for every request made by this script.
my $browser = LWP::UserAgent::Determined->new;

# Geocoding services. The scrape loop consults them in the order
# Google, MapQuest, Bing until one yields a latitude/longitude pair.
# NOTE(review): the API keys are hard-coded; consider moving them out of the
# source file (environment variable or config file).
my $geocoder = Geo::Coder::Google->new(apiver => 3);

# The package name must be spelled exactly like the module that was loaded
# (Geo::Coder::MapQuest); Perl package names are case-sensitive, so calling
# ->new on "Geo::Coder::Mapquest" dies with "Can't locate object method".
my $geocoder2 = Geo::Coder::MapQuest->new(apikey => '0XduYu3Hb9W4D5GLQb1MALJgXfxxA6Q3');
my $geocoder3 = Geo::Coder::Bing->new(key => 'AoI9z1wFi2Yw7l2bhBOFo8I---T-mPZetl9NKBWPgDU3U2gXv861q8W1a-7Km6pf');

my $count = 0;    # number of the customer-list page currently being scraped
my $found = 0;    # set to 1 when the current page produced at least one list item
my %ids;          # system name => array ref of customer ids seen during this run

# Command line: [system] [subdomain]
#   system    - account name; also the URL path segment when no subdomain is given
#   subdomain - optional; when present the site lives at <subdomain>.mycustomerconnect.com/app
my $system    = $ARGV[0] || 'emminneapolis';
my $subdomain = $ARGV[1];
my $subdir    = ($subdomain) ? 'app' : $system;
my $domain    = ($subdomain) ? "$subdomain.mycustomerconnect.com" : "mycustomerconnect.com";

# Base URL of the paginated customer list; the page number is appended to it.
my $url = "http://$domain/$subdir/classes/customerdata.php?data=getCustomerList&page=";
print "scraping $url\n";

# Persistent cache of everything scraped so far (Storable format).
my $cache_file = 'C:\\wwwroot\\maphash.txt';
my %hash;

# First run: create an empty cache so that retrieve() below has a file to read.
if (!-e $cache_file) {
    store(\%hash, $cache_file) or die "cannot create $cache_file: $!\n";
}

# retrieve() returns undef on I/O errors; fail with a clear message instead of
# dying on an undefined hash reference.
my $cached = retrieve($cache_file) or die "cannot read $cache_file: $!\n";
%hash = %{$cached};

# Count this run and persist the counter immediately.
$hash{runcount}++;
store(\%hash, $cache_file) or die "cannot write $cache_file: $!\n";
print "run# ".$hash{runcount} . "\n";
# ---------------------------------------------------------------------------
# Main scrape: walk the paginated customer list and, for every customer,
# collect address/geocode, account details, invoices, route and task flag.
# The cache is written to disk after each customer.
# ---------------------------------------------------------------------------

# Invoice line-item markup. The head captures, in order: item id, name,
# description, "Del Avg", quantity, rate, "Last Qty", "Last Date".
# Some items carry an extra "Fill Cap" field between head and tail.
my $item_head_re = qr/\<li id='(.*?)' onclick='displayitemdetails\(this.id\)'\>\<a\>\<h2\>(.*?)\<\/h2\>\<p\>(.*?)\<\/p\>\<p\>\<strong\>Del Avg:\<\/strong\>(.*?)\<strong\>Qty:\<\/strong\>(.*?)\<strong\>Rate:\<\/strong\>(.*?)\<\/p\>\<p\>\<strong\>Last Qty:\<\/strong\>(.*?)\<strong\>Last Date:\<\/strong\>(.*?)/i;
my $item_tail_re = qr/\<\/p\>\<p class='ui-li-aside'\>\<strong\>Total\<\/strong\>(.*?)\<\/p\>\<\/a\>\<\/li\>/i;

# Both layouts are scanned, "Fill Cap" variant first, then the plain one.
my @item_patterns = (
    qr/$item_head_re\<strong\>Fill Cap:\<\/strong\>(.*?)$item_tail_re/,
    qr/$item_head_re$item_tail_re/,
);

# Geocoding providers in fallback order. Each entry is
# [ label for the failure message, geocoder object, extractor ], where the
# extractor turns one provider result into a (latitude, longitude) list.
my @geocode_providers = (
    [
        'google', $geocoder,
        sub {
            my ($loc) = @_;
            return ($loc->{geometry}{location}{lat}, $loc->{geometry}{location}{lng});
        },
    ],
    [
        'mapquest', $geocoder2,
        sub {
            my ($loc) = @_;
            return ($loc->{latLng}{lat}, $loc->{latLng}{lng});
        },
    ],
    [
        'bing', $geocoder3,
        sub {
            my ($loc) = @_;
            # Index 0 is used as latitude and index 1 as longitude.
            return ($loc->{point}{coordinates}[0], $loc->{point}{coordinates}[1]);
        },
    ],
);

# Fetch $target with user agent $ua and return the decoded body.
# Dies with the HTTP status line (and the URL) when the request fails.
sub fetch_content {
    my ($ua, $target) = @_;

    my $response = $ua->get($target);
    die "GET $target failed: " . $response->status_line . "\n"
        unless $response->is_success;

    my $body = $response->decoded_content;
    return defined $body ? $body : '';
}

# Try each provider in turn until one returns a usable coordinate pair.
# Returns (lat, long) on success and an empty list when every provider failed.
sub geocode_address {
    my ($address, @providers) = @_;

    for my $provider (@providers) {
        my ($label, $coder, $extract) = @{$provider};
        my ($lat, $long);

        try {
            my $location = $coder->geocode(location => $address);
            ($lat, $long) = $extract->($location) if $location;

            # Throttle between lookups. The builtin sleep truncates 1.5 to 1,
            # so the high-resolution variant is used to really wait 1.5 s.
            Time::HiRes::sleep(1.5);
        }
        catch {
            print "$label geocode API failed\n";
        };

        return ($lat, $long) if $lat && $long;
    }

    return;
}

while (1) {
    $found = 0;
    $count++;

    my $html = fetch_content($browser, $url . $count);

    # One <li> per customer on the list page.
    while ($html =~ /\<li\>(.*?)\<\/li\>/g) {
        my $cust = $1;
        $found = 1;

        # No /g on these single matches: /g in scalar context would leave
        # pos() set and make each later search start after the previous hit,
        # so fields would be missed depending on their order in the markup.
        my ($id)   = $cust =~ /customer=(.*?)'/;
        my ($name) = $cust =~ /\<h2\>(.*?)\<\/h2\>/;
        my ($addr, $addr2) = $cust =~ /Address:\<\/strong\> (.*?)\<\/p\>\<p\>(.*?)\<\/p\>/;

        # A list item without a customer id cannot be keyed or looked up.
        if (!defined $id) {
            warn "skipping list item without customer id on page $count\n";
            next;
        }

        my $address = join ' ', map { defined $_ ? $_ : '' } $addr, $addr2;

        my $record = $hash{$system}{customers}{$id} ||= {};
        $record->{name} = $name;
        $record->{addr} = $address;
        push @{ $ids{$system} }, $id;

        # Reuse cached coordinates; geocode only when either one is missing.
        my ($lat, $long) = @{$record}{qw(lat long)};
        if (!$lat || !$long) {
            ($lat, $long) = geocode_address($address, @geocode_providers);
        }
        $record->{lat}  = $lat;
        $record->{long} = $long;

        # Account details page.
        my $details = fetch_content($browser, "http://$domain/$subdir/classes/customerdetaildata.php?customer=$id");
        my ($mon)     = $details =~ /\<strong\>Balance:\<\/strong\>(.*?)\<\/p\>/i;
        my ($email)   = $details =~ /\<strong\>Email:\<\/strong\>(.*?)\<\/p\>/i;
        my ($billing) = $details =~ /\<strong\>Billing Address:\<\/strong\>\<\/p\>(.*?)\<br \/\>/i;
        my ($termss)  = $details =~ /\<strong\>Terms:\<\/strong\>(.*?)\<\/p\>/i;
        my ($rep)     = $details =~ /\<strong\>Sales Rep:\<\/strong\>(.*?)\<\/p\>/i;
        my ($tax)     = $details =~ /\<strong\>Tax Rate:\<\/strong\>(.*?)\<\/p\>/i;

        # Invoice list page.
        my $invoices = fetch_content($browser, "http://$domain/$subdir/classes/customerinfodata.php?data=invoices&listid=$id");

        # $dt is the data-theme of the last invoice entry ('n' when there is
        # none); customers whose name starts with "ZZ" are forced to 'z'.
        my $dt = 'n';
        my @themes = $invoices =~ /li data-theme=\'(.*?)\'/g;
        $dt = $themes[-1] if @themes;
        $dt = 'z' if defined $name && $name =~ /^ZZ/i;

        # Invoices are rebuilt from scratch on every run.
        delete $record->{invoices};

        # Dates and invoice ids are paired by their order of appearance.
        my @dates       = $invoices =~ /(\d+\/\d+\/\d+)/g;
        my @invoice_ids = $invoices =~ /pcinvoiceid=(\d+)/g;

        for my $idx (0 .. $#invoice_ids) {
            my $invid   = $invoice_ids[$idx];
            my $invoice = $record->{invoices}{$invid} ||= {};
            $invoice->{date} = $dates[$idx];

            my $headers = fetch_content($browser, "http://$domain/$subdir/classes/getdata.php?data=invoiceheader&pcinvoiceid=$invid");
            $invoice->{notes} = $1 if $headers =~ /\<textarea name='notes' readonly cols='100' rows='3'>(.*?)\<\/textarea\>/;
            $invoice->{num}   = $1 if $headers =~ /\<strong\>Invoice \#:\<\/strong\>(.*?)\<\/p\>/i;

            my $items = fetch_content($browser, "http://$domain/$subdir/classes/getdata.php?data=invoicelines&pcinvoiceid=$invid");
            for my $item_re (@item_patterns) {
                while ($items =~ /$item_re/g) {
                    my $item = $invoice->{items}{$1} ||= {};
                    $item->{name} = $2;
                    $item->{desc} = $3;
                    $item->{qty}  = $5;
                    $item->{rate} = $6;
                }
            }
        }

        # Route: among week-based frequencies, stop at the first one that is
        # "weekly" or "week <n>"; otherwise the last week-based route wins.
        my $routes = fetch_content($browser, "http://$domain/$subdir/classes/customerinfodata.php?data=routes&listid=$id");
        my $route = "*";
        while ($routes =~ /Route: \<\/strong>(.*?)\<strong\>Frequency: \<\/strong\>(.*?)\<strong\>/g) {
            my ($tech, $freq) = ($1, $2);
            next if $freq !~ /week/i;
            $tech =~ s/\s//g;
            $route = $tech;
            # Alternation, not a character class: [weekly|week \d] matched any
            # single one of those characters and therefore always succeeded.
            last if $freq =~ /weekly|week \d/i;
        }

        # Task flag: 1 unless the page explicitly reports an empty task list.
        my $tasks = fetch_content($browser, "http://$domain/$subdir/classes/customerinfodata.php?data=tasks&listid=$id");
        my $hastask = ($tasks =~ /no tasks in the list/i) ? 0 : 1;

        $record->{dt}      = $dt;
        $record->{route}   = $route;
        $record->{task}    = $hastask;
        $record->{balance} = $mon;
        $record->{terms}   = $termss;
        $record->{tax}     = $tax;
        $record->{email}   = $email;
        $record->{rep}     = $rep;
        $record->{billing} = $billing;

        print Dumper($record);

        # Persist after every customer so an aborted run keeps its progress.
        store \%hash, 'C:\\wwwroot\\maphash.txt';
    }

    # A page without any list item marks the end of the customer list.
    last if !$found;
    sleep 1;
}
# ---------------------------------------------------------------------------
# Finish: drop cached customers of this system that were not seen during this
# run, write the cache and copy it to the secondary file.
# ---------------------------------------------------------------------------

# Lookup table of the ids collected by the scrape. The previous test,
# grep($key, @ids), evaluated the truthiness of $key for each element instead
# of comparing, so it never detected a stale customer once any id was seen.
my %seen_id = map { $_ => 1 } grep { defined } @{ $ids{$system} || [] };

foreach my $key (keys %{ $hash{$system}{customers} }) {
    delete $hash{$system}{customers}{$key} unless $seen_id{$key};
}

my $hash_path   = 'C:\\wwwroot\\maphash.txt';
my $backup_path = 'C:\\wwwroot\\maphash2.txt';

store(\%hash, $hash_path) or die "cannot write $hash_path: $!\n";
copy($hash_path, $backup_path) or die "Copy failed: $!";
__END__
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment