Scraper for the dog registry in Plovdiv, Bulgaria
1. Open http://registry.plovdiv.bg/eDogs/default.aspx | |
2. Select 50 entries per page | |
3. Paste the scraper JS into the browser console and run it | |
4. Back up the downloaded data file (the browser saves it as dogs.tsv; the commands below expect it as plddogs.tsv) | |
5. Extract all addresses: | |
awk -F '\t' '{print $4}' plddogs.tsv | grep -v address | sed 's_,\? \?\(ет\|ап\)\.\? \?[0-9]\+ \?__g;s_ \?№ \?_ _' |sort -u > addr | |
6. Geotag the addresses: | |
php geotag.php addr > addrg | |
7. Merge geotagged addresses with scraped data and exclude columns: | |
for i in `sed 's_\t_|_g;s_ _\__g' plddogs.tsv | grep -v address`; | |
do c=`echo $i | sed 's_\__ _g'`; | |
echo -n `echo $c | awk -F '|' '{print $1","$2","$5","$6","}'`; | |
a=`echo $c | awk -F '|' '{print $4}'| sed 's_,\? \?\(ет\|ап\)\.\? \?[0-9]\+ \?__g;s_ \?№ \?_ _'`; | |
grep -m 1 "^$a" addrg | sed 's_.*\t__'; | |
done > data.csv | |
8. Cleanup extracted report: | |
sed -i 's_\([0-9]\{2\}\)\.\([0-9]\{2\}\)\.\([0-9]\{4\}\) г\._\3-\2-\1_;s_\([0-9]\{4\}\)-\([0-9]\{2\}\)-00_\1-\2-01_;s_\([0-9]\{4\}\)-00-\([0-9]\{2\}\)_\1-01-\2_' data.csv | |
php cleandata.php data.csv > data_clean.php | |
9. Manually replace gender and category values and fix dates. |
<?php
/*
Category codes (translated from the Bulgarian legend):
1 neutered
2 microchipped
3 hunting dog
4 owned by a person with a disability
5 from a shelter
6 service dog
*/
/*
Usage: php cleandata.php data.csv > output

Reads a comma-separated file whose rows have either 6 or 10 fields. For
10-field rows only the last 6 fields are kept. Fields 4 and 5 of the
resulting row are treated as latitude and longitude.

This rounds down the precision of the coordinates and displaces them by a few meters.
This anonymises the data and separates points that are at the same address.

Diagnostics go to STDERR and the script exits with a non-zero status, so a
failed run never leaves an error message inside the redirected output.
*/
if (!isset($argv[1])) {
    fwrite(STDERR, "usage: php cleandata.php data.csv\n");
    exit(1);
}

$contents = file_get_contents($argv[1]);
if ($contents === false) {
    fwrite(STDERR, "error: cannot read ".$argv[1]."\n");
    exit(1);
}

// An empty (or whitespace-only) file has no rows; explode() would otherwise
// yield one empty row and trip the field-count check below.
$contents = trim($contents);
$data = $contents === '' ? array() : explode("\n", $contents);

foreach ($data as $i => $line) {
    $row = explode(",", trim($line));
    $fields = count($row);
    if ($fields != 6 && $fields != 10) {
        // $i is the zero-based index of the offending input line.
        fwrite(STDERR, "\nerror".$i."\n");
        exit(1);
    }
    if ($fields == 10) {
        // Keep only the trailing six fields (original indexes 4..9).
        $row = array_slice($row, 4, 6);
    }

    $lat = $row[4];
    $lng = $row[5];

    // Random offset: radius between 0.00005 and 0.0005 degrees, in a random direction.
    $r = rand(5, 50) / 100000;
    $angle = rand(0, 100) / 100 * M_PI * 2;
    $row[4] = round($lat + sin($angle) * $r, 5);
    $row[5] = round($lng + cos($angle) * $r, 5);

    echo implode(",", $row)."\n";
}
?>
<?php
/*
Usage: php geotag.php addr [start] > addrg

Reads one address per line from the file given as the first argument and
prints "<address>\t<result>" for each line, where <result> is either
'ERROR' or one or more "lat,lng|name" entries joined by '&'.

The optional second argument is the zero-based line index to start from,
which allows an interrupted run to be resumed.

Google is tried first; once it reports that the query limit is exhausted,
all remaining addresses go straight to the OpenStreetMap lookup.
*/
if (!isset($argv[1])) {
    fwrite(STDERR, "usage: php geotag.php addr [start]\n");
    exit(1);
}

$stopG = false;
$start = count($argv) > 2 ? max(0, intval($argv[2])) : 0;

$contents = file_get_contents($argv[1]);
if ($contents === false) {
    fwrite(STDERR, "error: cannot read ".$argv[1]."\n");
    exit(1);
}

// Strip trailing whitespace only: this removes the empty last entry that a
// final newline would produce, while keeping line indexes stable for [start].
$contents = rtrim($contents);
$data = $contents === '' ? array() : explode("\n", $contents);
$total = count($data);

for ($i = $start; $i < $total; $i++) {
    $add = $data[$i].", Пловдив, България";
    $res = false;
    if (!$stopG) {
        $res = geocodeGoogle($add);
        if ($res === "limit") {
            // Quota exhausted: stop asking Google for the rest of the run.
            $res = false;
            $stopG = true;
        }
    }
    if (!$res)
        $res = geocodeOpen($add);
    // Pause between lookups to throttle requests to the geocoding services.
    usleep(300000);
    // false and an empty result list are both reported as ERROR.
    echo $data[$i]."\t".($res ? implode('&', $res) : 'ERROR')."\n";
}
/**
 * Geocodes an address with the Google Geocoding API.
 *
 * The "[google api key]" placeholder in the URL must be replaced with a real
 * API key before use.
 *
 * @param string $add Address to look up; surrounding whitespace is trimmed.
 * @return array|string|false List of "lat,lng|formatted_address" strings on
 *         success, the string "limit" when the API answers OVER_QUERY_LIMIT,
 *         or false when the request fails, the response is not valid JSON,
 *         or there are no results.
 */
function geocodeGoogle($add) {
    // The separator before "region" must be a literal '&'; it had been
    // mangled into a '®' character, which dropped the region bias and
    // corrupted the address parameter.
    $url = "https://maps.googleapis.com/maps/api/geocode/json?address=".urlencode(trim($add))
        ."&region=BG&sensor=false&key=[google api key]";
    $data = file_get_contents($url);
    if (!$data)
        return false;
    $data = json_decode($data);
    // json_decode() yields null for a malformed body; do not dereference it.
    if (!is_object($data) || !isset($data->status))
        return false;
    if ($data->status == "OVER_QUERY_LIMIT")
        return "limit";
    if ($data->status != "OK" || empty($data->results))
        return false;
    $res = array();
    foreach ($data->results as $row) {
        $res[] = $row->geometry->location->lat.','.$row->geometry->location->lng.'|'.$row->formatted_address;
    }
    return $res;
}
/**
 * Geocodes an address with the OpenStreetMap Nominatim search API.
 *
 * @param string $add Address to look up; surrounding whitespace is trimmed.
 * @return array|false List of "lat,lon|display_name" strings (empty when the
 *         service finds nothing), or false when the request fails or the
 *         response is not a JSON list.
 */
function geocodeOpen($add) {
    // The public Nominatim usage policy asks clients to identify themselves;
    // PHP's HTTP wrapper does not send a User-Agent unless one is configured.
    // NOTE(review): replace this value with one that identifies your own
    // application/contact before running against the public server.
    $context = stream_context_create(array(
        'http' => array('user_agent' => 'plovdiv-dog-registry-geotag/1.0'),
    ));
    $url = "https://nominatim.openstreetmap.org/search?format=json&addressdetails=1&q=".urlencode(trim($add));
    $data = file_get_contents($url, false, $context);
    if (!$data)
        return false;
    $data = json_decode($data);
    // A malformed or non-list response cannot be iterated.
    if (!is_array($data))
        return false;
    $res = array();
    foreach ($data as $row)
        $res[] = $row->lat.','.$row->lon.'|'.$row->display_name;
    return $res;
}
?> |
/**
 * Offers `text` to the user as a downloadable plain-text file.
 *
 * A hidden anchor carrying the text as a UTF-8 data URI is attached to the
 * page, clicked programmatically, and then removed again.
 *
 * @param {string} filename Name suggested for the saved file.
 * @param {string} text     File contents.
 */
function download(filename, text) {
    var link = document.createElement('a');
    var payload = 'data:text/plain;charset=utf-8,' + encodeURIComponent(text);

    link.setAttribute('href', payload);
    link.setAttribute('download', filename);
    link.style.display = 'none';

    // The anchor is attached to the document only for the duration of the click.
    var host = document.body;
    host.appendChild(link);
    link.click();
    host.removeChild(link);
}
// Number of result pages, read from the pager label on the page.
pages = parseInt($('#ctl00_ContentPlaceHolder1_lblTotalPages').text());
// Accumulated TSV output, starting with the header row.
data = "regNum\tregDate\tname\taddress\tgender\tcategory";
// Page currently shown; selecting it in the dropdown triggers the page load.
page = 1;
$('#ctl00_ContentPlaceHolder1_ddlPage').val(page).change();

// Every 2 seconds: harvest the rows of the currently displayed page, then
// either switch to the next page or, after the last one, stop and download.
task = setInterval(function () {
    $('.gridview tr').each(function (rowIndex, row) {
        // The first row of the grid is skipped.
        if (rowIndex == 0) {
            return;
        }
        // Each cell contributes its whitespace-collapsed text plus a tab.
        var line = $(row).children().map(function (cellIndex, cell) {
            return $(cell).text().replace(/\s+/g, " ").trim() + "\t";
        }).get().join("");
        // NOTE(review): this drops the last TWO characters, not just the final
        // tab. Presumably the grid has a trailing empty cell so that two tabs
        // are removed; if not, the last character of the last cell is lost.
        // Verify against the live page.
        data += "\n" + line.substring(0, line.length - 2);
    });

    page = page + 1;
    if (page > pages) {
        clearInterval(task);
        download('dogs.tsv', data);
        return;
    }
    $('#ctl00_ContentPlaceHolder1_ddlPage').val(page).change();
}, 2000);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment