Skip to content

Instantly share code, notes, and snippets.

@jimyhuang
Created May 31, 2015 10:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jimyhuang/e94f9cd5a3fd651a99ed to your computer and use it in GitHub Desktop.
Save jimyhuang/e94f9cd5a3fd651a99ed to your computer and use it in GitHub Desktop.
pet-hsz.php
<?php
/*
 * Step 1 - crawl the paginated animal listing.
 *
 * Requests Animal.aspx?page=0, 1, 2, ... and appends the markup of every
 * element whose class attribute is exactly "animal_list" to the cache file
 * /tmp/animal. The crawl ends when a request fails or returns an empty body,
 * or when a page after the first one contains no listing element at all
 * (otherwise a server that answers out-of-range pages with an empty listing
 * would keep this loop running forever).
 */
$page = 0;
$output = '';
// Start from an empty cache file; no need to shell out for this.
file_put_contents('/tmp/animal', '');
while (true) {
    $url = 'http://animalprotection.hchg.gov.tw/sca/C/Animal.aspx?page='.$page;
    // @ hides the HTTP warning; a failed request is the normal end of the crawl.
    $html = @file_get_contents($url);
    if (!$html) {
        break;
    }

    $doc = new DOMDocument();
    $new = new DOMDocument();
    // The XML encoding prolog makes libxml read the markup as UTF-8;
    // @ silences the warnings libxml emits for malformed real-world HTML.
    @$doc->loadHTML('<?xml encoding="utf-8" ?>'."\n".$html);
    $xpath = new DOMXPath($doc);
    $div = $xpath->query('//*[@class="animal_list"]');

    $found = ($div !== false) ? $div->length : 0;
    if ($found === 0 && $page > 0) {
        // Ran past the last page. Page 0 is tolerated being empty in case the
        // site's numbering effectively starts at 1 (not verifiable from here).
        break;
    }

    if ($found > 0) {
        foreach ($div as $d) {
            // importNode(..., TRUE) already makes a deep copy owned by $new.
            $new->appendChild($new->importNode($d, TRUE));
        }
    }

    $output = $new->saveHTML();
    // saveHTML() emits non-ASCII text as entities; turn them back into UTF-8.
    // NOTE: the 'HTML-ENTITIES' pseudo-encoding is deprecated as of PHP 8.2.
    $output = mb_convert_encoding($output, 'UTF-8', 'HTML-ENTITIES');
    file_put_contents("/tmp/animal", $output, FILE_APPEND);

    $page++;
    // Be polite to the remote server between page requests.
    sleep(3);
}
/*
 * Step 2 - parse the cached listing into CSV rows.
 *
 * The cache file is cut into one fragment per <div class="animal_list">.
 * For each fragment the thumbnail number and the detail-page link are
 * extracted, the detail page is downloaded, and every label/value pair found
 * in it becomes a column. $header collects the union of all column names in
 * first-seen order; $rows ends up as the header followed by one record per
 * animal, every record carrying the same columns in the same order.
 */
$c = file_get_contents("/tmp/animal");
if ($c === false) {
    $c = '';
}
// Split in front of each listing <div> using a zero-width lookahead, so no
// sentinel string has to be injected into (and possibly collide with) the HTML.
$array = preg_split('/(?=<div class="animal_list">)/', $c);
if ($array === false) {
    $array = array();
}
$header = array();
$rows = array();
foreach ($array as $a) {
    $a = trim($a);
    if (empty($a)) {
        continue;
    }

    // Thumbnail number -> full-size picture URL (empty when there is none).
    $img = '';
    if (preg_match('/AnimalsThumb\/(\d+)/i', $a, $thumb)) {
        $img = 'http://animalprotection.hchg.gov.tw/sca/Animals/'.$thumb[1].'.JPG';
    }

    // Without a detail link there is nothing to fetch: skip the fragment
    // instead of calling file_get_contents() with an empty path.
    if (!preg_match('/AnimalDetail\.aspx\?id=(\d+)/i', $a, $ids)) {
        continue;
    }
    // $ids[0] is the matched relative link, $ids[1] the numeric id.
    $url_detail = 'http://animalprotection.hchg.gov.tw/sca/C/'.$ids[0];
    echo "Fetching ... ".$ids[1]."\n";

    $animal = file_get_contents($url_detail);
    if ($animal) {
        $fields = array();
        if (!isset($header['id'])) {
            $header['id'] = 'id';
        }
        $fields['id'] = $ids[1];

        // Group 1 = label cell text, group 2 = value cell text.
        preg_match_all("/animal_detail_right_T'>([^<]+)<[^>]+>[^<]*<[^>]+class='animal_detail_right_C'>([^<]+)/i", $animal, $detail);
        foreach ($detail[1] as $k => $m) {
            if (!empty($m)) {
                $label = str_replace(":", '', $m);
                if (!isset($header[$label])) {
                    $header[$label] = $label;
                }
                $value = trim(str_replace('&nbsp;', '', $detail[2][$k]));
                $fields[$label] = $value;
            }
        }

        if (!isset($header['圖片'])) {
            $header['圖片'] = '圖片';
        }
        $fields['圖片'] = $img;
        $rows[] = $fields;
    }
    // Be polite to the remote server between detail requests.
    sleep(3);
}

// Animals may expose different (or differently ordered) fields. Re-shape every
// record to the final header so each value lands under its own column;
// fields an animal does not have become empty cells.
$columns = array_keys($header);
foreach ($rows as $i => $row) {
    $aligned = array();
    foreach ($columns as $col) {
        $aligned[$col] = isset($row[$col]) ? $row[$col] : '';
    }
    $rows[$i] = $aligned;
}
array_unshift($rows, $header);
/**
 * Step 3 - export.
 *
 * Writes $rows (header record first) to pet-hsz.csv in the current working
 * directory. The CSV text is assembled in a temporary in-memory file object
 * by league/csv and then dumped to disk in one go.
 *
 * Install: composer require league/csv
 *
 * NOTE(review): newer league/csv releases appear to offer toString() as the
 * replacement for __toString() - confirm against the installed version.
 */
require 'vendor/autoload.php';
use League\Csv\Writer;
$writer = Writer::createFromFileObject(new SplTempFileObject());
$writer->setEnclosure('"');
$writer->insertAll($rows);
file_put_contents('pet-hsz.csv', $writer->__toString());
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment