Skip to content

Instantly share code, notes, and snippets.

@yurukov
Last active February 8, 2016 11:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yurukov/2c211c2151eaeb5a403e to your computer and use it in GitHub Desktop.
Save yurukov/2c211c2151eaeb5a403e to your computer and use it in GitHub Desktop.
Scraper for the dog registry in Plovdiv, Bulgaria
1. Open http://registry.plovdiv.bg/eDogs/default.aspx
2. Select 50 entries per page
3. Paste the scraper JS into the browser console and run it
4. Back up the downloaded data file (dogs.tsv); the commands below expect it to be named plddogs.tsv
5. Extract all addresses:
awk -F '\t' '{print $4}' plddogs.tsv | grep -v address | sed 's_,\? \?\(ет\|ап\)\.\? \?[0-9]\+ \?__g;s_ \?№ \?_ _' |sort -u > addr
6. Geotag the addresses:
php geotag.php addr > addrg
7. Merge geotagged addresses with scraped data and exclude columns:
for i in `sed 's_\t_|_g;s_ _\__g' plddogs.tsv | grep -v address`;
do c=`echo $i | sed 's_\__ _g'`;
echo -n `echo $c | awk -F '|' '{print $1","$2","$5","$6","}'`;
a=`echo $c | awk -F '|' '{print $4}'| sed 's_,\? \?\(ет\|ап\)\.\? \?[0-9]\+ \?__g;s_ \?№ \?_ _'`;
grep -m 1 "^$a" addrg | sed 's_.*\t__';
done > data.csv
8. Cleanup extracted report:
sed -i 's_\([0-9]\{2\}\)\.\([0-9]\{2\}\)\.\([0-9]\{4\}\) г\._\3-\2-\1_;s_\([0-9]\{4\}\)-\([0-9]\{2\}\)-00_\1-\2-01_;s_\([0-9]\{4\}\)-00-\([0-9]\{2\}\)_\1-01-\2_' data.csv
php cleandata.php data.csv > data_clean.csv
9. Manually replace gender and category values and fix dates.
<?php
/*
Legend (presumably the numeric codes used for the category column - confirm):
1 neutered
2 microchipped
3 hunting dog
4 owned by a disabled person
5 from a shelter
6 service dog
*/
/*
Anonymises the report file given as the first command-line argument.

Every input line must hold 6 or 10 comma-separated columns; for 10-column
lines only the last 6 are kept. Columns 5 and 6 of the kept row are read as
latitude and longitude, displaced by 0.00005-0.0005 degrees in a random
direction and rounded to 5 decimals. This anonymises the data and separates
points that are at the same address. The resulting rows are printed to stdout.

Problems are reported on stderr with a non-zero exit status, so an error
message never ends up inside the redirected output file.
*/
if (!isset($argv[1])) {
    fwrite(STDERR, "usage: php cleandata.php <data.csv>\n");
    exit(1);
}
$data = file_get_contents($argv[1]);
if ($data === false) {
    fwrite(STDERR, "cannot read " . $argv[1] . "\n");
    exit(1);
}
$data = explode("\n", trim($data));
$total = count($data);
for ($i = 0; $i < $total; $i++) {
    $row = explode(",", trim($data[$i]));
    $columns = count($row);
    if ($columns != 6 && $columns != 10) {
        // $i is the zero-based index of the offending line.
        fwrite(STDERR, "error" . $i . "\n");
        exit(1);
    }
    if ($columns == 10) {
        // Keep only the last six columns (indexes 4..9).
        $row = array_slice($row, 4);
    }
    // Explicit casts keep the leading-number conversion the arithmetic relied on
    // and avoid a TypeError on PHP 8 when a value is not numeric.
    $lat = (float)$row[4];
    $lng = (float)$row[5];
    $r = rand(5, 50) / 100000;
    $angle = rand(0, 100) / 100 * M_PI * 2;
    $row[4] = round($lat + sin($angle) * $r, 5);
    $row[5] = round($lng + cos($angle) * $r, 5);
    $data[$i] = $row;
    echo implode(",", $row) . "\n";
}
?>
<?php
/*
Geocodes every address line of the file given as the first command-line
argument and prints one "<address>\t<results>" line per address to stdout.
An optional second argument is the zero-based line index to start from.

Google is tried first; once it reports a quota limit it is not queried again
and every remaining address goes straight to OpenStreetMap. Multiple results
for one address are joined with '&'; 'ERROR' marks an address with no result.
*/
if (!isset($argv[1])) {
    fwrite(STDERR, "usage: php geotag.php <address file> [start line]\n");
    exit(1);
}
$stopG = false;
$start = count($argv) > 2 ? intval($argv[2]) : 0;
$data = file_get_contents($argv[1]);
if ($data === false) {
    fwrite(STDERR, "cannot read " . $argv[1] . "\n");
    exit(1);
}
$data = explode("\n", $data);
$total = count($data);
for ($i = $start; $i < $total; $i++) {
    // Blank lines (e.g. the one after the final newline) would otherwise be
    // geocoded as just the city and produce a bogus row.
    if (trim($data[$i]) === '') {
        continue;
    }
    $add = $data[$i] . ", Пловдив, България";
    $res = false;
    if (!$stopG) {
        $res = geocodeGoogle($add);
        if ($res === "limit") {
            $res = false;
            $stopG = true;
        }
    }
    if (!$res) {
        $res = geocodeOpen($add);
    }
    // Pause between addresses to throttle the requests.
    usleep(300000);
    // false and an empty list both mean "nothing found".
    echo $data[$i] . "\t" . ($res ? implode('&', $res) : 'ERROR') . "\n";
}
/**
 * Looks up an address with the Google Geocoding API.
 *
 * The "[google api key]" placeholder in the URL must be replaced with a real key.
 *
 * @param string $add Address to look up; it is trimmed and URL-encoded here.
 * @return array|string|false List of "lat,lng|formatted_address" strings, one per
 *                            result; the string "limit" when Google reports that a
 *                            quota is exhausted; false when the request fails, the
 *                            response cannot be decoded or nothing was found.
 */
function geocodeGoogle($add) {
    $url = "https://maps.googleapis.com/maps/api/geocode/json?address=" . urlencode(trim($add))
        . "&region=BG&sensor=false&key=[google api key]";
    $raw = file_get_contents($url);
    if (!$raw)
        return false;
    $data = json_decode($raw);
    // A response that is not a JSON object with a status cannot be interpreted.
    if (!is_object($data) || !isset($data->status))
        return false;
    // Both statuses mean further requests will keep failing until the quota resets.
    if ($data->status == "OVER_QUERY_LIMIT" || $data->status == "OVER_DAILY_LIMIT")
        return "limit";
    if ($data->status != "OK" || empty($data->results))
        return false;
    $res = array();
    foreach ($data->results as $row) {
        $location = $row->geometry->location;
        $res[] = $location->lat . ',' . $location->lng . '|' . $row->formatted_address;
    }
    return $res;
}
/**
 * Looks up an address with the OpenStreetMap Nominatim search API.
 *
 * @param string $add Address to look up; it is trimmed and URL-encoded here.
 * @return array|false List of "lat,lon|display_name" strings, one per result
 *                     (empty when nothing usable came back); false when the
 *                     request itself fails.
 */
function geocodeOpen($add) {
    // Nominatim's usage policy asks for a User-Agent that identifies the
    // application; PHP sends none unless one is configured.
    $context = stream_context_create(array(
        'http' => array('user_agent' => 'plovdiv-dog-registry-geotagger/1.0'),
    ));
    $url = "http://nominatim.openstreetmap.org/search?format=json&addressdetails=1&q=" . urlencode(trim($add));
    $raw = file_get_contents($url, false, $context);
    if (!$raw)
        return false;
    $rows = json_decode($raw);
    $res = array();
    // Only a JSON list is a result set; anything else (undecodable body, error
    // object) yields no results instead of warnings and garbage entries.
    if (!is_array($rows))
        return $res;
    foreach ($rows as $row) {
        if (!isset($row->lat, $row->lon, $row->display_name))
            continue;
        $res[] = $row->lat . ',' . $row->lon . '|' . $row->display_name;
    }
    return $res;
}
?>
/**
 * Triggers a browser download of `text` as a file called `filename`.
 *
 * A hidden <a> element pointing at a data: URI of the text is attached to the
 * document, clicked programmatically and removed again.
 *
 * @param {string} filename Name offered for the downloaded file.
 * @param {string} text Content of the file.
 */
function download(filename, text) {
    var link = document.createElement('a');
    var attributes = {
        href: 'data:text/plain;charset=utf-8,' + encodeURIComponent(text),
        download: filename
    };
    Object.keys(attributes).forEach(function(key) {
        link.setAttribute(key, attributes[key]);
    });
    link.style.display = 'none';

    var body = document.body;
    body.appendChild(link);
    link.click();
    body.removeChild(link);
}
// Walks through every page of the registry grid, collecting each data row as
// one tab-separated line, and downloads the result as dogs.tsv when done.
// The variables are deliberately left global so they can be inspected (or the
// timer cleared with clearInterval(task)) from the console while it runs.
pages = parseInt($('#ctl00_ContentPlaceHolder1_lblTotalPages').text(), 10);
if (isNaN(pages) || pages < 1) {
    // Without a valid page count the timer below would never stop.
    throw new Error('Could not read the total number of pages');
}
data = "regNum\tregDate\tname\taddress\tgender\tcategory";
page = 1;
$('#ctl00_ContentPlaceHolder1_ddlPage').val(page).change();
// Every 2 seconds: scrape the grid as currently shown, then request the next
// page. The delay is what gives the page time to refresh between requests.
task = setInterval(function() {
    $('.gridview tr').each(function(i) {
        if (i == 0) return; // the first row is skipped; presumably the header (confirm)
        var cells = $(this).children().map(function(j, c) {
            return $(c).text().replace(/\s+/g, " ").trim();
        }).get();
        // Join with tabs and drop trailing empty cells. Cutting a fixed two
        // characters off the end would eat the last character of the final
        // cell whenever the row does not end in an empty cell.
        data += "\n" + cells.join("\t").replace(/\t+$/, "");
    });
    if ((++page) > pages) {
        clearInterval(task);
        download('dogs.tsv', data);
    } else {
        $('#ctl00_ContentPlaceHolder1_ddlPage').val(page).change();
    }
}, 2000);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment