# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @ImagoTrigger
# Last active April 3, 2017 01:18
# Show Gist options
#   • Save ImagoTrigger/46f108b445f6005711d56cea75ab576f to your computer and use it in GitHub Desktop.
# Save ImagoTrigger/46f108b445f6005711d56cea75ab576f to your computer and use it in GitHub Desktop.
# RouteStar data scraper
use strict;
use LWP::UserAgent::Determined;
use Geo::Coder::Google;
use Geo::Coder::MapQuest;
use Geo::Coder::Bing;
use Storable;
use File::Copy;
use Try::Tiny;
use Data::Dumper;
# HTTP client shared by every request in this script.
my $browser = LWP::UserAgent::Determined->new();

# Geocoding clients; the scrape loop tries them in the order google, mapquest, bing.
# NOTE(review): the MapQuest and Bing credentials are literal strings in this file.
# NOTE(review): the class is spelled Geo::Coder::Mapquest here, while the module
# is loaded as Geo::Coder::MapQuest -- confirm which casing the installed
# module actually uses.
my $geocoder  = Geo::Coder::Google->new(apiver => 3);
my $geocoder2 = Geo::Coder::Mapquest->new(apikey => '0XduYu3Hb9W4D5GLQb1MALJgXfxxA6Q3');
my $geocoder3 = Geo::Coder::Bing->new(key => 'AoI9z1wFi2Yw7l2bhBOFo8I---T-mPZetl9NKBWPgDU3U2gXv861q8W1a-7Km6pf');

# Page counter and per-page "saw at least one entry" flag used by the scrape loop.
my ($count, $found) = (0, 0);

# Customer ids seen during this run, keyed by system name.
my %ids;

# Command line: first argument is the system name (with a default), the
# optional second argument is a subdomain. When a subdomain is given the
# path segment is 'app'; otherwise the system name is used as the path segment.
my $system    = $ARGV[0] || 'emminneapolis';
my $subdomain = $ARGV[1];
my $subdir;
my $domain;
if ($subdomain) {
    $subdir = 'app';
    $domain = "$subdomain.mycustomerconnect.com";
}
else {
    $subdir = $system;
    $domain = "mycustomerconnect.com";
}
my $url = "http://$domain/$subdir/classes/customerdata.php?data=getCustomerList&page=";
print "scraping $url\n";

# Persistent cache: create an empty store on first use, load it, bump the run
# counter and write it straight back before any scraping starts.
my %hash;
unless (-e 'C:\\wwwroot\\maphash.txt') {
    store \%hash, 'C:\\wwwroot\\maphash.txt';
}
my $saved = retrieve('C:\\wwwroot\\maphash.txt');
%hash = %{$saved};
++$hash{runcount};
store \%hash, 'C:\\wwwroot\\maphash.txt';
print "run# " . $hash{runcount} . "\n";
# Main scrape loop.
#
# Walks the paginated customer list at $url (page 1, 2, ...) until a page
# yields no <li> entries. For each customer it records name and address,
# geocodes the address when no coordinates are cached in %hash, pulls the
# detail / invoice / route / task pages, and writes %hash to disk after every
# customer. Any unsuccessful HTTP response aborts the run via die with the
# response status line.

require Time::HiRes;    # the built-in sleep() truncates 1.5 to 1 second

my $class_base = "http://$domain/$subdir/classes";

# GET a URL through the shared user agent; returns the decoded body or dies
# with the HTTP status line.
my $fetch = sub {
    my ($target) = @_;
    my $response = $browser->get($target);
    die $response->status_line unless $response->is_success;
    return $response->decoded_content;
};

# Return the text between "<strong>LABEL:</strong>" and the next "</p>", or
# undef when the label is absent. Deliberately not /g: a scalar-context /g
# match would resume from the previous match position on the same string and
# could miss fields that appear in a different order than they are queried.
my $labelled_field = sub {
    my ($markup, $label) = @_;
    return $markup =~ m{<strong>\Q$label\E:</strong>(.*?)</p>}i ? $1 : undef;
};

# Geocoders in fallback order, each with the accessor that pulls (lat, long)
# out of that provider's result structure.
my @geocode_providers = (
    {
        label  => 'google',
        coder  => $geocoder,
        coords => sub {
            my ($loc) = @_;
            return ($loc->{geometry}{location}{lat}, $loc->{geometry}{location}{lng});
        },
    },
    {
        label  => 'mapquest',
        coder  => $geocoder2,
        coords => sub {
            my ($loc) = @_;
            return ($loc->{latLng}{lat}, $loc->{latLng}{lng});
        },
    },
    {
        label  => 'bing',
        coder  => $geocoder3,
        coords => sub {
            my ($loc) = @_;
            return ($loc->{point}{coordinates}[0], $loc->{point}{coordinates}[1]);
        },
    },
);

# Invoice line markup, with and without the "Fill Cap" field. Both patterns
# are applied in this order; captures 1,2,3,5,6 are id, name, description,
# quantity and rate.
my @item_patterns = (
    qr/\<li id='(.*?)' onclick='displayitemdetails\(this.id\)'\>\<a\>\<h2\>(.*?)\<\/h2\>\<p\>(.*?)\<\/p\>\<p\>\<strong\>Del Avg:\<\/strong\>(.*?)\<strong\>Qty:\<\/strong\>(.*?)\<strong\>Rate:\<\/strong\>(.*?)\<\/p\>\<p\>\<strong\>Last Qty:\<\/strong\>(.*?)\<strong\>Last Date:\<\/strong\>(.*?)\<strong\>Fill Cap:\<\/strong\>(.*?)\<\/p\>\<p class='ui-li-aside'\>\<strong\>Total\<\/strong\>(.*?)\<\/p\>\<\/a\>\<\/li\>/i,
    qr/\<li id='(.*?)' onclick='displayitemdetails\(this.id\)'\>\<a\>\<h2\>(.*?)\<\/h2\>\<p\>(.*?)\<\/p\>\<p\>\<strong\>Del Avg:\<\/strong\>(.*?)\<strong\>Qty:\<\/strong\>(.*?)\<strong\>Rate:\<\/strong\>(.*?)\<\/p\>\<p\>\<strong\>Last Qty:\<\/strong\>(.*?)\<strong\>Last Date:\<\/strong\>(.*?)\<\/p\>\<p class='ui-li-aside'\>\<strong\>Total\<\/strong\>(.*?)\<\/p\>\<\/a\>\<\/li\>/i,
);

PAGE:
while (1) {
    $found = 0;
    $count++;
    my $html = $fetch->($url . $count);

    CUSTOMER:
    while ($html =~ /\<li\>(.*?)\<\/li\>/g) {
        my $cust = $1;

        # Any <li> counts as "page had content" for pagination purposes.
        $found = 1;

        my ($id, $name, $addr, $addr2);
        $id   = $1 if $cust =~ /customer=(.*?)'/;
        $name = $1 if $cust =~ /\<h2\>(.*?)\<\/h2\>/;
        ($addr, $addr2) = ($1, $2)
            if $cust =~ /Address:\<\/strong\> (.*?)\<\/p\>\<p\>(.*?)\<\/p\>/;

        # Without an id there is nothing to key the record on or to look up.
        next CUSTOMER unless defined $id;

        my $address = "$addr $addr2";
        my $record  = $hash{$system}{customers}{$id} ||= {};
        $record->{name} = $name;
        $record->{addr} = $address;
        push @{ $ids{$system} }, $id;

        # Reuse cached coordinates when both are present; otherwise try each
        # geocoder in turn until one yields both values.
        my ($lat, $long) = @{$record}{qw(lat long)};
        unless ($lat && $long) {
            ($lat, $long) = ();
            for my $provider (@geocode_providers) {
                my ($try_lat, $try_long);
                try {
                    my $location = $provider->{coder}->geocode(location => $address);
                    ($try_lat, $try_long) = $provider->{coords}->($location || {});
                    Time::HiRes::sleep(1.5);    # throttle between API calls
                }
                catch {
                    print "$provider->{label} geocode API failed\n";
                    ($try_lat, $try_long) = ();
                };
                if ($try_lat && $try_long) {
                    ($lat, $long) = ($try_lat, $try_long);
                    last;
                }
            }
        }
        @{$record}{qw(lat long)} = ($lat, $long);

        # Customer detail page.
        my $details = $fetch->("$class_base/customerdetaildata.php?customer=$id");
        my $balance = $labelled_field->($details, 'Balance');
        my $email   = $labelled_field->($details, 'Email');
        my $terms   = $labelled_field->($details, 'Terms');
        my $rep     = $labelled_field->($details, 'Sales Rep');
        my $tax     = $labelled_field->($details, 'Tax Rate');
        my $billing;
        $billing = $1
            if $details =~ m{<strong>Billing Address:</strong></p>(.*?)<br />}i;

        # Invoice list page. The last data-theme value on the page wins;
        # names starting with "ZZ" are always tagged 'z'.
        my $invoices = $fetch->("$class_base/customerinfodata.php?data=invoices&listid=$id");
        my $dt = 'n';
        my @themes = $invoices =~ /li data-theme=\'(.*?)\'/g;
        $dt = $themes[-1] if @themes;
        $dt = 'z' if $name =~ /^ZZ/i;

        # Rebuild the invoice subtree from scratch on every run. Dates are
        # paired with invoice ids purely by their order of appearance.
        delete $record->{invoices};
        my @dates       = $invoices =~ m{(\d+/\d+/\d+)}g;
        my @invoice_ids = $invoices =~ /pcinvoiceid=(\d+)/g;
        for my $idx (0 .. $#invoice_ids) {
            my $invid   = $invoice_ids[$idx];
            my $invoice = $record->{invoices}{$invid} ||= {};
            $invoice->{date} = $dates[$idx];

            my $headers = $fetch->("$class_base/getdata.php?data=invoiceheader&pcinvoiceid=$invid");
            $invoice->{notes} = $1
                if $headers =~ /\<textarea name='notes' readonly cols='100' rows='3'>(.*?)\<\/textarea\>/;
            my $number = $labelled_field->($headers, 'Invoice #');
            $invoice->{num} = $number if defined $number;

            my $items = $fetch->("$class_base/getdata.php?data=invoicelines&pcinvoiceid=$invid");
            for my $pattern (@item_patterns) {
                while ($items =~ /$pattern/g) {
                    my ($item_id, $item_name, $desc, $qty, $rate) = ($1, $2, $3, $5, $6);
                    @{ $invoice->{items}{$item_id} }{qw(name desc qty rate)}
                        = ($item_name, $desc, $qty, $rate);
                }
            }
        }

        # Route page: remember the most recent week-based route, stopping at
        # the first one whose frequency is "weekly" or "week <digit>".
        my $routes = $fetch->("$class_base/customerinfodata.php?data=routes&listid=$id");
        my $route  = "*";
        while ($routes =~ /Route: \<\/strong>(.*?)\<strong\>Frequency: \<\/strong\>(.*?)\<strong\>/g) {
            my ($tech, $freq) = ($1, $2);
            next if $freq !~ /week/i;
            $tech =~ s/\s//g;
            $route = $tech;
            # Alternation, not a character class: [weekly|week \d] matched
            # any single one of those characters and so always stopped here.
            last if $freq =~ /(?:weekly|week \d)/i;
        }

        # Task page: flag is cleared only when the page says the list is empty.
        my $tasks   = $fetch->("$class_base/customerinfodata.php?data=tasks&listid=$id");
        my $hastask = ($tasks =~ /no tasks in the list/i) ? 0 : 1;

        @{$record}{qw(dt route task balance terms tax email rep billing)}
            = ($dt, $route, $hastask, $balance, $terms, $tax, $email, $rep, $billing);

        print Dumper($record);

        # Persist after every customer so an aborted run keeps its progress.
        store \%hash, 'C:\\wwwroot\\maphash.txt';
    }

    last PAGE if !$found;
    sleep 1;
}
# Remove cached customers of this system whose id was not seen during this
# run, then persist the cache and refresh the secondary copy.
#
# The membership test uses a lookup hash: grep($key, @list) evaluates the
# truthiness of $key for every element rather than comparing it to the
# elements, so it never pruned anything unless the id list was empty.
my %seen_id = map { $_ => 1 } grep { defined } @{ $ids{$system} || [] };
foreach my $key (keys %{ $hash{$system}{customers} || {} }) {
    delete $hash{$system}{customers}{$key} unless $seen_id{$key};
}
store \%hash, 'C:\\wwwroot\\maphash.txt';
copy("C:\\wwwroot\\maphash.txt","C:\\wwwroot\\maphash2.txt") or die "Copy failed: $!";
__END__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment