Skip to content

Instantly share code, notes, and snippets.

@sters
Created December 5, 2017 03:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sters/1fe2e7f3dac8db0bfe0eb3fce989ea94 to your computer and use it in GitHub Desktop.
Save sters/1fe2e7f3dac8db0bfe0eb3fce989ea94 to your computer and use it in GitHub Desktop.
easy scraping kit
/**
 * Console helpers for scraping scripts: line output, comma-separated list
 * input, quoted result output and sleep wrappers.
 */
class Console
{
    /** @var string Path of the result CSV file; filled in by init(). */
    public static $resultFile = '';

    /**
     * Derives the result file path from this file's name:
     * "<__DIR__>/result_<basename before the first '.php'>.csv".
     *
     * @return void
     */
    public static function init()
    {
        // __FILE__ can carry text after ".php" (for example when the code is
        // eval()'d), so only the part before the first ".php" is used. If the
        // file name has no ".php" at all, fall back to the plain file name
        // instead of reading an undefined match group.
        if (preg_match('@^(.+?)\.php@', __FILE__, $matches) === 1) {
            $scriptName = basename($matches[1]);
        } else {
            $scriptName = pathinfo(__FILE__, PATHINFO_FILENAME);
        }
        static::$resultFile = __DIR__ . '/result_' . $scriptName . '.csv';
    }

    /**
     * Prints one line, optionally prefixed with $indent spaces.
     *
     * @param string $str    Text to print.
     * @param int    $indent Number of leading spaces.
     * @return void
     */
    public static function out($str, $indent = 0)
    {
        if ($indent > 0) {
            $str = str_repeat(' ', $indent) . $str;
        }
        echo "{$str}\n";
    }

    /**
     * Lazily reads a comma-separated list file, yielding one array of fields
     * per non-blank line. Fields are split on "," only; no quoting rules apply.
     *
     * @param string $file Path of the list file.
     * @return \Generator Yields arrays of strings.
     */
    public static function getListFile($file)
    {
        $contents = file_get_contents($file);
        if ($contents === false) {
            // Unreadable file: nothing to yield (PHP already emitted a warning).
            return;
        }
        // Split on any line-ending style so CRLF files do not leave a stray
        // "\r" glued to the last field of every row.
        foreach (preg_split('/\r\n|\r|\n/', $contents) as $line) {
            if (trim($line) === '') {
                continue;
            }
            yield explode(',', $line);
        }
    }

    /**
     * Appends one line to the result file.
     *
     * Arrays are written as comma-joined, double-quoted values; any double
     * quote inside a value is replaced by a single quote (lossy, but keeps
     * the row parseable). Non-array data is written as-is.
     *
     * @param array|string $data Row values or a preformatted line.
     * @return void
     */
    public static function outputResult($data)
    {
        if (is_array($data)) {
            foreach ($data as $k => $v) {
                $data[$k] = '"' . str_replace('"', "'", $v) . '"';
            }
            $data = implode(',', $data);
        }
        // static:: matches the property init() writes to.
        file_put_contents(static::$resultFile, $data . "\n", FILE_APPEND);
    }

    /**
     * Sleeps for $t seconds.
     *
     * @param int $t Seconds.
     * @return void
     */
    public static function sleep($t)
    {
        sleep($t);
    }

    /**
     * Sleeps for $t milliseconds.
     *
     * @param int|float $t Milliseconds.
     * @return void
     */
    public static function msleep($t)
    {
        usleep($t * 1000);
    }

    /**
     * Sleeps for $t microseconds.
     *
     * @param int $t Microseconds.
     * @return void
     */
    public static function usleep($t)
    {
        usleep($t);
    }
}
// Resolve the result file path as soon as the kit is loaded.
Console::init();
/**
 * Thin wrapper around DOMDocument for scraping. It patches charset
 * declarations before parsing and offers XPath / simple CSS-selector lookups
 * plus a few text-extraction helpers.
 */
class HTMLDoc
{
    /** @var string HTML as handed to the parser (after charset patching). */
    public $raw;
    /** @var DOMDocument Parsed document. */
    public $doc;
    /** @var int Minimum pause between two loadURL() calls, in milliseconds. */
    public static $waitMin = 2000;
    /** @var int Maximum pause between two loadURL() calls, in milliseconds. */
    public static $waitMax = 4000;

    /**
     * Parses $html into a DOMDocument.
     *
     * If the markup has no "charset=" within its first 10%, a
     * Content-Type meta tag built from $subCharset is injected after <head>.
     * A few known meta variants are also rewritten (UTF-8 short form,
     * Windows-31J / shift_jis to CP932).
     *
     * @param string $html       Markup to parse; an empty string gives an empty document.
     * @param string $subCharset Fallback charset ("utf-8") or a full
     *                           Content-Type value ("text/html; charset=utf-8").
     */
    public function __construct($html = '', $subCharset = 'utf-8')
    {
        $this->doc = new DOMDocument();
        $this->doc->preserveWhiteSpace = false;
        $html = (string) $html;

        $charsetPos = strpos($html, 'charset=');
        if ($charsetPos === false || $charsetPos >= strlen($html) * 0.1) {
            // A bare charset name such as "utf-8" carries no "charset=", so
            // expand it to a full Content-Type value before injecting it.
            $html = preg_replace(
                '@<head>@i',
                '<head><meta http-equiv="Content-Type" content="' . self::normalizeContentType($subCharset) . '">',
                $html
            );
        }
        $html = preg_replace('@<meta charset="UTF-8">@i', '<meta charset="UTF-8"><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">', $html);
        $html = preg_replace('@<meta charset="UTF-8"/>@i', '<meta charset="UTF-8"><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">', $html);
        $html = preg_replace('@<meta http-equiv="Content-Type" content="text/html; charset=Windows-31J">@i', '<meta http-equiv="Content-Type" content="text/html; charset=CP932">', $html);
        $html = preg_replace('@<meta http-equiv="Content-Type" content="text/html; charset=shift_jis">@i', '<meta http-equiv="Content-Type" content="text/html; charset=CP932">', $html);

        // loadHTML('') throws ValueError on PHP 8 (and "@" cannot silence an
        // exception), so an empty input simply leaves the document empty.
        if ($html !== '') {
            @$this->doc->loadHTML($html);
        }
        $this->raw = $html;
    }

    /**
     * Turns a bare charset name into a Content-Type value; values that already
     * look like a Content-Type (contain "charset=" or a MIME "/") pass through.
     *
     * @param string $subCharset
     * @return string
     */
    private static function normalizeContentType($subCharset)
    {
        $subCharset = trim((string) $subCharset);
        if ($subCharset === '') {
            return 'text/html; charset=utf-8';
        }
        if (stripos($subCharset, 'charset=') !== false || strpos($subCharset, '/') !== false) {
            return $subCharset;
        }
        return 'text/html; charset=' . $subCharset;
    }

    /**
     * Fetches $url with cURL and wraps the response body in a new document.
     *
     * Consecutive calls are throttled: after each successful fetch a random
     * pause between $waitMin and $waitMax milliseconds is scheduled, and the
     * next call sleeps for whatever part of it has not yet elapsed.
     *
     * @param string $url     URL to fetch.
     * @param array  $options Extra CURLOPT_* options; they override the
     *                        defaults below, except URL and RETURNTRANSFER.
     * @return static
     * @throws Exception When cURL reports an error.
     */
    public static function loadURL($url, $options = [])
    {
        // Earliest moment (microtime seconds) at which the next request may start.
        static $waitTime;
        if (! empty($waitTime)) {
            $remaining = $waitTime - microtime(true);
            if ($remaining > 0) {
                Console::msleep((int) ceil($remaining * 1000));
            }
        }

        $defaults = [
            CURLOPT_HEADER => false,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36',
            // NOTE(review): certificate verification is disabled by default;
            // pass CURLOPT_SSL_VERIFYPEER => true in $options to enable it.
            CURLOPT_SSL_VERIFYPEER => false,
        ];
        // The array union keeps the LEFT operand on key collisions, so the
        // required options come first, then the caller's, then the defaults.
        $required = [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
        ];

        $ch = curl_init();
        curl_setopt_array($ch, $required + $options + $defaults);
        $response = curl_exec($ch);
        $err = curl_error($ch);
        $info = curl_getinfo($ch);
        curl_close($ch);
        if (! empty($err)) {
            throw new Exception($err);
        }

        // Use the Content-Type of the response we actually received rather
        // than issuing a second request just to read the headers.
        $contentType = empty($info['content_type']) ? 'utf-8' : $info['content_type'];

        // $waitMin/$waitMax are milliseconds; microtime(true) is seconds.
        $waitTime = microtime(true) + mt_rand(static::$waitMin, static::$waitMax) / 1000;
        return new static($response, $contentType);
    }

    /**
     * Runs an XPath query against the document.
     *
     * @param string|array $query   XPath string, or a path object accepted by buildXPath().
     * @param DOMNode|null $element Optional context node; when given, a leading
     *                              "//" or "/" is stripped so the query is
     *                              evaluated relative to that node.
     * @return DOMNodeList|false
     */
    public function findXPath($query, $element = null)
    {
        if (is_array($query)) {
            // buildXPath() is a method of this class, not a global function.
            $query = $this->buildXPath($query);
        }
        if (! empty($element)) {
            $query = preg_replace('/^\/\//', '', $query);
            $query = preg_replace('/^\//', '', $query);
        }
        $xpath = new DOMXPath($this->doc);
        return $xpath->query(trim($query), $element);
    }

    /**
     * Translates a simple CSS selector (tags, #id, .class, ">" and descendant
     * whitespace) into XPath and runs it through findXPath().
     *
     * @param string       $query   CSS selector.
     * @param DOMNode|null $element Optional context node.
     * @return DOMNodeList|false
     */
    public function findCSSPath($query, $element = null)
    {
        // "#id" becomes an [@id="..."] predicate.
        $query = preg_replace('/\#([a-z0-9\-_]+?)(\s|$)/i', '[@id="\1"] ', $query);
        // ".a.b" becomes contains(@class, ...) predicates joined by "and".
        // "%s%" stands in for the spaces around "and" so the whitespace
        // rewriting below leaves them alone.
        $query = preg_replace_callback('/\.([^\s]+?)(\s|$)/i', function ($f) {
            $classes = explode('.', $f[1]);
            $query = [];
            foreach ($classes as $class) {
                $query[] = 'contains(@class,"' . $class . '")';
            }
            return '[' . implode('%s%and%s%', $query) . '] ';
        }, $query);
        // ">" (child combinator) becomes "/".
        $query = preg_replace('/([a-z0-9\]])\s*>\s*([a-z0-9.[#])/i', '\1/\2', $query);
        // Whitespace (descendant combinator) becomes "//".
        $query = preg_replace('/([a-z0-9\]])\s([a-z0-9.[#])/i', '\1//\2', $query);
        $query = preg_replace('/(^|\s)(\*)\s([a-z0-9.[#])/i', '\2//\3', $query);
        // Turn the placeholders back into spaces.
        $query = str_replace('%s%', ' ', $query);
        // Rough clean-up: a predicate with no tag name gets "*" in front.
        $query = '//' . preg_replace('/^\[/', '*[', $query);
        $query = str_replace('/[', '/*[', $query);
        return $this->findXPath($query, $element);
    }

    /**
     * Concatenates a path object into an XPath string. Every value is appended
     * verbatim, except the 'hasClass' key, whose list of class names becomes a
     * "[contains(@class, "a") and contains(@class, "b")]" predicate.
     *
     * @param array $pathObject
     * @return string
     */
    public function buildXPath($pathObject)
    {
        $path = '';
        foreach ($pathObject as $k => $v) {
            if ($k === 'hasClass') {
                $conditions = [];
                foreach ($v as $className) {
                    $conditions[] = 'contains(@class, "' . $className . '")';
                }
                $path .= '[' . implode(' and ', $conditions) . ']';
                continue;
            }
            $path .= $v;
        }
        return $path;
    }

    /**
     * Returns the text of a string or node, trimmed, with every run of two or
     * more whitespace characters collapsed to one space.
     *
     * @param string|DOMNode|null $element
     * @return string Empty string when the node has no (non-empty) text.
     */
    public function trimingSpace($element)
    {
        if (! is_string($element)) {
            if (empty($element->textContent)) {
                return '';
            }
            $element = $element->textContent;
        }
        return trim(preg_replace('/\s\s+/', ' ', $element));
    }

    /**
     * Iterates several indexable collections in lock-step.
     *
     * @param array $options 'count' => number of rows to produce,
     *                       'elements' => map of key => indexable collection.
     * @return \Generator Yields one array per index with the same keys as 'elements'.
     */
    public function getZippedIterator($options)
    {
        for ($i = 0; $i < $options['count']; $i++) {
            $zipped = [];
            foreach ($options['elements'] as $k => $v) {
                $zipped[$k] = $v[$i];
            }
            yield $zipped;
        }
    }

    /**
     * Converts a <dl> element into an array, pairing the i-th <dt> with the
     * i-th <dd>. Pairs whose <dt> text is empty are appended with a numeric key.
     *
     * @param DOMNode $dlElement
     * @return array
     */
    public function getParsedDlTag($dlElement)
    {
        $dt = $this->findCSSPath('dt', $dlElement);
        $dd = $this->findCSSPath('dd', $dlElement);
        $option = [
            'count' => $dt->length,
            'elements' => compact('dt', 'dd'),
        ];
        $results = [];
        foreach ($this->getZippedIterator($option) as $dtdd) {
            $key = $this->trimingSpace($dtdd['dt']);
            $value = $this->trimingSpace($dtdd['dd']);
            if (empty($key)) {
                $results[] = $value;
            } else {
                $results[$key] = $value;
            }
        }
        return $results;
    }

    /**
     * Looks up an attribute by name.
     *
     * @param DOMNode $tag
     * @param string  $attr Attribute name.
     * @return string|null Attribute value, or null when the attribute is absent.
     */
    public function getAttribute($tag, $attr)
    {
        foreach ($tag->attributes as $value) {
            if ($attr === $value->nodeName) {
                return $value->value;
            }
        }
        return null;
    }

    /**
     * Extracts the link text and href of an <a> element.
     *
     * @param DOMNode $aTag
     * @return array ['title' => string, 'url' => string]
     */
    public function getParsedATag($aTag)
    {
        return [
            'title' => $this->trimingSpace($aTag),
            'url' => $this->trimingSpace($this->getAttribute($aTag, 'href')),
        ];
    }
}
/**
 * JSON flavour of HTMLDoc: keeps the raw payload and its json_decode() result.
 * The XPath / CSS lookups are intentionally inert for JSON and return null.
 */
class JSONDoc extends HTMLDoc
{
    /**
     * @param string $html Raw JSON text.
     */
    public function __construct($html = '')
    {
        $this->raw = $html;
        $this->doc = json_decode($this->raw);
    }

    /**
     * Not supported for JSON documents.
     *
     * @return null
     */
    public function findXPath($query, $element = null)
    {
        return null;
    }

    /**
     * Not supported for JSON documents.
     *
     * @return null
     */
    public function findCSSPath($query, $element = null)
    {
        return null;
    }
}
/**
 * Minimal file-backed FIFO queue.
 *
 * The queue file lists entry names, one per line, in insertion order. Each
 * entry's payload is stored serialize()'d in a file of the same name inside
 * the queue directory.
 */
class Queue
{
    /** @var bool True between init() and reset(). */
    public static $initialized = false;
    /** @var string|null Path of the file listing queued entry names. */
    public static $queueFile;
    /** @var string|null Directory holding one payload file per entry. */
    public static $queueDir;

    /**
     * Creates the queue file and queue directory (if missing) next to this
     * source file and marks the queue as usable.
     *
     * @param string $queueName File name of the name list.
     * @return void
     */
    public static function init($queueName = 'queue_list')
    {
        static::$queueFile = __DIR__ . '/' . $queueName;
        // Appending an empty string creates the file without truncating it.
        file_put_contents(static::$queueFile, '', FILE_APPEND);
        static::$queueDir = __DIR__ . '/queue';
        if (! file_exists(static::$queueDir)) {
            mkdir(static::$queueDir);
        }
        static::$initialized = true;
    }

    /**
     * Deletes the queue directory (recursively) and the queue file, then marks
     * the queue as uninitialized. Deletion is best-effort.
     *
     * @return void
     */
    public static function reset()
    {
        if (is_string(static::$queueDir) && file_exists(static::$queueDir)) {
            self::removeDirectoryContents(static::$queueDir);
            @rmdir(static::$queueDir);
        }
        if (is_string(static::$queueFile)) {
            @unlink(static::$queueFile);
        }
        static::$initialized = false;
    }

    /**
     * Removes everything inside $dir, descending into sub-directories.
     *
     * @param string $dir
     * @return void
     */
    private static function removeDirectoryContents($dir)
    {
        foreach (new DirectoryIterator($dir) as $entry) {
            if ($entry->isDot()) {
                continue;
            }
            $path = $entry->getPathname();
            // A symlink to a directory is removed as a link, never followed.
            if ($entry->isDir() && ! $entry->isLink()) {
                self::removeDirectoryContents($path);
                @rmdir($path);
            } else {
                @unlink($path);
            }
        }
    }

    /**
     * Appends an entry to the queue.
     *
     * @param string $name Entry name; a random suffix is added when a payload
     *                     file of that name already exists.
     * @param mixed  $obj  Payload; stored with serialize().
     * @return void
     * @throws Exception When the queue has not been initialized.
     */
    public static function push($name, $obj)
    {
        if (static::$initialized === false) {
            throw new Exception('Queue is not initialized; call Queue::init() first.');
        }
        if (file_exists(static::$queueDir . '/' . $name)) {
            $name .= '_' . md5(mt_rand());
        }
        file_put_contents(static::$queueDir . '/' . $name, serialize($obj));
        file_put_contents(static::$queueFile, $name . "\n", FILE_APPEND);
    }

    /**
     * Removes and returns the oldest entry's payload.
     *
     * @return mixed
     * @throws Exception When the queue has not been initialized or is empty.
     */
    public static function pop()
    {
        if (static::$initialized === false) {
            throw new Exception('Queue is not initialized; call Queue::init() first.');
        }
        $content = trim(file_get_contents(static::$queueFile));
        // Compare against '' explicitly: empty() would also treat a queue
        // holding only the name "0" as empty.
        if ($content === '') {
            throw new Exception('Queue is empty.');
        }
        $names = explode("\n", $content);
        $name = array_shift($names);
        // Keep the trailing newline so the next push() starts on its own line
        // instead of being glued to the last remaining name.
        $remaining = $names === [] ? '' : implode("\n", $names) . "\n";
        file_put_contents(static::$queueFile, $remaining);

        $payloadFile = static::$queueDir . '/' . $name;
        // NOTE(review): unserialize() must only ever see files written by
        // push(); do not point the queue directory at untrusted data.
        $data = unserialize(file_get_contents($payloadFile));
        unlink($payloadFile);
        return $data;
    }

    /**
     * Returns the queued entry names, oldest first.
     *
     * @return string[] Empty array when nothing is queued.
     */
    public static function getList()
    {
        $content = trim(file_get_contents(static::$queueFile));
        // explode() on '' would produce [''], i.e. one phantom entry.
        return $content === '' ? [] : explode("\n", $content);
    }
}
// The queue is made ready as soon as the kit is loaded.
Queue::init();
/**
 * File cache keyed by the SHA-1 of the entry name. Values are stored
 * serialize()'d, optionally fanned out into sub-directories named after
 * successive two-character slices of the hash.
 */
class Cache
{
    /** @var string Root directory of the cache (with trailing slash). */
    public static $cacheDir = __DIR__ . '/cache/';
    /** @var int Number of two-character hash slices used as nested directories. */
    public static $subDirectoryDepth = 1;

    /**
     * Creates the cache root directory when it does not exist yet.
     */
    public static function init()
    {
        if (file_exists(static::$cacheDir)) {
            return;
        }
        mkdir(static::$cacheDir);
    }

    /**
     * Computes where the entry named $name lives.
     *
     * @param string $name  Entry name.
     * @param bool   $split When false (strictly), return the full file path;
     *                      otherwise return [directory, file name].
     * @return string|array
     */
    public static function path($name, $split = false)
    {
        $digest = sha1($name);
        $prefix = '';
        $level = 0;
        while ($level < static::$subDirectoryDepth) {
            $prefix .= substr($digest, $level * 2, 2) . '/';
            $level++;
        }
        $directory = static::$cacheDir . $prefix;
        if ($split !== false) {
            return [$directory, $digest];
        }
        return $directory . $digest;
    }

    /**
     * Stores $data under $name, creating the target directory as needed.
     * Write failures are suppressed.
     */
    public static function save($name, $data)
    {
        list($directory, $fileName) = static::path($name, true);
        if (! file_exists($directory)) {
            mkdir($directory, 0777, true);
        }
        @file_put_contents($directory . $fileName, serialize($data));
    }

    /**
     * Returns the value stored under $name; read/decode errors are suppressed
     * and surface as false.
     */
    public static function load($name)
    {
        $file = static::path($name);
        return @unserialize(file_get_contents($file));
    }

    /**
     * Removes the entry named $name; a missing file is ignored.
     */
    public static function delete($name)
    {
        $file = static::path($name);
        @unlink($file);
    }

    /**
     * Tells whether an entry named $name is present on disk.
     */
    public static function exists($name)
    {
        $file = static::path($name);
        return file_exists($file);
    }
}
// Make sure the cache root exists as soon as the kit is loaded.
Cache::init();
/**
 * Flattens a nested array into a plain list of its leaf values, in
 * traversal order. Keys are discarded.
 *
 * @param array $arr
 * @return array
 */
function array_flatten($arr)
{
    $leaves = new \RecursiveIteratorIterator(new \RecursiveArrayIterator($arr));
    $flat = [];
    foreach ($leaves as $leaf) {
        $flat[] = $leaf;
    }
    return $flat;
}
/**
 * Swaps a fixed set of UTF-8 byte sequences for their fullwidth / alternate
 * forms: U+301C to U+FF5E, U+2212 to U+FF0D, U+2016 to U+2225,
 * U+00A2 to U+FFE0, U+00A3 to U+FFE1 and U+00AC to U+FFE2.
 * NOTE(review): presumably these are the characters that do not survive a
 * conversion to CP932 - confirm against the callers.
 *
 * @param string $html
 * @return string
 */
function invalid_string_replace($html)
{
    // Each row: [byte-level regex, replacement bytes].
    $substitutions = [
        ['/\xE3\x80\x9C/', "\xEF\xBD\x9E"],
        ['/\xE2\x88\x92/', "\xEF\xBC\x8D"],
        ['/\xE2\x80\x96/', "\xE2\x88\xA5"],
        ['/\xC2\xA2/', "\xEF\xBF\xA0"],
        ['/\xC2\xA3/', "\xEF\xBF\xA1"],
        ['/\xC2\xAC/', "\xEF\xBF\xA2"],
    ];
    $patterns = array_column($substitutions, 0);
    $replacements = array_column($substitutions, 1);
    return preg_replace($patterns, $replacements, $html);
}
<?php
/**
 * Helpers for the test script: output, temporary files and assertions.
 * Failed assertions throw TestError.
 */
class TestHelper
{
    /** @var string[] Files created through createDummyFile() and not yet removed. */
    private static $testFiles = [];

    /**
     * Prints $msg without a trailing newline.
     *
     * @param string $msg
     */
    public static function o($msg = '')
    {
        echo "{$msg}";
    }

    /**
     * Prints $msg followed by a newline.
     *
     * @param string $msg
     */
    public static function out($msg = '')
    {
        echo "{$msg}\n";
    }

    /**
     * Writes a file and remembers it so unlinkDummyFiles() can remove it.
     *
     * @param string $filename
     * @param string $contents
     */
    public static function createDummyFile($filename, $contents)
    {
        static::$testFiles[] = $filename;
        file_put_contents($filename, $contents);
    }

    /**
     * Removes every file registered by createDummyFile().
     */
    public static function unlinkDummyFiles()
    {
        foreach (static::$testFiles as $file) {
            // A test may already have deleted its own file; skip those
            // instead of raising an unlink() warning.
            if (file_exists($file)) {
                unlink($file);
            }
        }
        static::$testFiles = [];
    }

    /**
     * Asserts $expect === $actual (strict comparison).
     *
     * @throws TestError On mismatch.
     */
    public static function assertEqual($expect, $actual)
    {
        if ($expect !== $actual) {
            $msg = "";
            $msg .= "assertEqual:\n";
            $msg .= "\texpect = " . serialize($expect) . "\n";
            $msg .= "\tactual = " . serialize($actual) . "\n";
            throw new TestError($msg);
        }
    }

    /**
     * Asserts that $expect occurs in $actual: as a substring when $actual is
     * a string, as an element (loose in_array comparison) when it is an array.
     *
     * @param mixed        $expect
     * @param string|array $actual
     * @throws TestError When $expect is not contained, or when $actual is
     *                   neither a string nor an array (e.g. false returned by
     *                   a failed exec()/file_get_contents()).
     */
    public static function assertContain($expect, $actual)
    {
        if (is_string($actual)) {
            $found = strpos($actual, $expect) !== false;
        } elseif (is_array($actual)) {
            $found = in_array($expect, $actual);
        } else {
            // Previously such values fell through and the assertion passed.
            $found = false;
        }
        if (! $found) {
            throw new TestError(
                "assertContain:\n\texpect = " . self::describe($expect)
                . "\n\tactual = " . self::describe($actual)
            );
        }
    }

    /**
     * Renders a value for an assertion message: strings verbatim, everything
     * else through var_export() (so arrays do not show up as "Array").
     *
     * @param mixed $value
     * @return string
     */
    private static function describe($value)
    {
        return is_string($value) ? $value : var_export($value, true);
    }

    /**
     * Runs $func and asserts that its echoed output contains $expect.
     *
     * @param string   $expect
     * @param callable $func
     * @throws TestError
     */
    public static function assertBuffering($expect, $func)
    {
        ob_start();
        $func();
        $result = ob_get_clean();
        static::assertContain($expect, $result);
    }

    /**
     * Asserts that $func takes $timeSec seconds to run, give or take $errorRange.
     *
     * @param float    $timeSec    Expected duration in seconds.
     * @param callable $func
     * @param float    $errorRange Allowed deviation in seconds.
     * @throws TestError
     */
    public static function assertWaitingTime($timeSec, $func, $errorRange = 0.2)
    {
        $startTime = microtime(true);
        $func();
        $endTime = microtime(true);
        $diffTime = $endTime - $startTime;
        $expectLarge = ($timeSec + $errorRange);
        $expectSmall = ($timeSec - $errorRange);
        if ($diffTime > $expectLarge || $diffTime < $expectSmall) {
            throw new TestError("assertWaitingTime:\n\texpect = {$timeSec}\n\tactual = {$diffTime}");
        }
    }

    /**
     * Asserts that $func throws, and that the thrown object's class is
     * exactly $expect (subclasses do not count).
     *
     * @param callable $func
     * @param string   $expect Fully qualified class name.
     * @throws TestError When nothing is thrown or the class differs.
     */
    public static function assertException($func, $expect = Exception::class)
    {
        try {
            $func();
        } catch (\Throwable $ex) {
            static::assertEqual($expect, get_class($ex));
            return;
        }
        throw new TestError("assertException:\n\texpect = {$expect}\n\tactual = NULL");
    }
}
/**
 * Exception thrown by TestHelper assertions when an expectation is not met.
 * TestRunner counts it as a "fail"; any other throwable counts as an "error".
 */
class TestError extends Exception
{
}
/**
 * Tiny test runner: tests are registered by name with define() and executed
 * in registration order by run().
 */
class TestRunner
{
    /** @var callable[] Registered tests, keyed by name. */
    public static $tests = [];

    /** Maximum number of stack frames printed per failed test. */
    const TRACE_DEPTH = 3;

    /**
     * Registers a test. Registering the same name twice replaces the earlier test.
     *
     * @param string   $name
     * @param callable $func
     */
    public static function define($name, $func)
    {
        static::$tests[$name] = $func;
    }

    /**
     * Runs every registered test, prints a progress line ("." success,
     * "F" TestError, "E" any other throwable), the failure details and the
     * totals, then clears the registry.
     *
     * @return bool True when at least one test failed or errored.
     */
    public static function run()
    {
        $count = [
            'success' => 0,
            'fail' => 0,
            'error' => 0,
        ];
        $errors = [];
        foreach (static::$tests as $name => $test) {
            try {
                // Clean up before as well as after: a test that threw on the
                // previous iteration never reached its own clean-up.
                TestHelper::unlinkDummyFiles();
                $test();
                TestHelper::unlinkDummyFiles();
                TestHelper::o('.');
                $count['success']++;
            } catch (\TestError $error) {
                TestHelper::o('F');
                $errors[$name] = $error;
                $count['fail']++;
            } catch (\Throwable $ex) {
                TestHelper::o('E');
                $errors[$name] = $ex;
                $count['error']++;
            }
        }
        TestHelper::out();
        if (count($errors) > 0) {
            TestHelper::out();
        }
        foreach ($errors as $name => $error) {
            TestHelper::out("[{$name}]\n" . $error->getMessage());
            static::printTrace($error);
        }
        TestHelper::out();
        foreach ($count as $type => $num) {
            TestHelper::out("{$type}\t: {$num}");
        }
        static::$tests = [];
        return $count['fail'] > 0 || $count['error'] > 0;
    }

    /**
     * Prints the first TRACE_DEPTH frames of a throwable's stack trace.
     * The trace may be empty, and frames for internal functions carry no
     * file/line, so both are guarded instead of indexed blindly.
     *
     * @param \Throwable $error
     */
    private static function printTrace($error)
    {
        $frames = array_slice($error->getTrace(), 0, self::TRACE_DEPTH);
        foreach ($frames as $index => $frame) {
            $file = isset($frame['file']) ? $frame['file'] : '[internal]';
            $line = isset($frame['line']) ? $frame['line'] : '?';
            TestHelper::out("\t#" . ($index + 1) . ' ' . $file . ':' . $line);
        }
    }
}
//==============================
// Syntax check
//==============================
// The kit is stored without an opening tag, so it is copied into a temporary
// .php file (removed again by the runner) and linted with the PHP CLI.
TestRunner::define('PHP構文が問題ないこと', function () {
    $file = __DIR__ . '/scraping_kit.txt';
    $testFile = $file . '.test.php';
    TestHelper::createDummyFile($testFile, "<?php\n" . file_get_contents($file));
    // Lint with the interpreter that runs this script, and quote both the
    // binary and the path so directories containing spaces work.
    $phpBinary = PHP_BINARY !== '' ? PHP_BINARY : 'php';
    // exec() returns false on failure; the cast turns that into a string the
    // assertion can reject.
    $result = (string) exec(escapeshellarg($phpBinary) . ' -l ' . escapeshellarg($testFile));
    TestHelper::assertContain('No syntax errors', $result);
});
//==============================
// Run the tests
// Only the syntax check runs at this point, before the kit is loaded.
//==============================
$failed = TestRunner::run();
if ($failed) {
    exit(1);
}
// Load the kit. The file has no "<?php" tag, so it is eval()'d rather than
// included. It is a local, trusted file sitting next to this script.
eval(file_get_contents(__DIR__ . '/scraping_kit.txt'));
TestHelper::out("\nLoad crawl kit: OK\n");
//==============================
// Console
//==============================
// Console::init() already ran when the kit was loaded.
TestRunner::define('already Console::init', function () {
    $resultPath = Console::$resultFile;
    TestHelper::assertContain('result__test.csv', $resultPath);
});
// Indented output must contain the padded text.
TestRunner::define('Console::out indent', function () {
    $printIndented = function () {
        Console::out('test', 2);
    };
    TestHelper::assertBuffering(' test', $printIndented);
});
// Every yielded row must equal the matching source line split on commas.
TestRunner::define('Console::getListFile', function () {
    $list = ['coffee,tea,milk', 'cat,dog'];
    $listFile = 'list.csv';
    TestHelper::createDummyFile($listFile, implode("\n", $list));
    foreach (Console::getListFile($listFile) as $index => $row) {
        TestHelper::assertEqual(explode(',', $list[$index]), $row);
    }
});
// Arrays are written quoted and comma-joined, strings verbatim.
TestRunner::define('Console::outputResult', function () {
    $resultPath = Console::$resultFile;
    TestHelper::createDummyFile($resultPath, '');

    Console::outputResult(['hoge', 'huga']);
    $afterArray = file_get_contents($resultPath);
    TestHelper::assertContain('"hoge","huga"', $afterArray);

    Console::outputResult('hogehoge');
    $afterString = file_get_contents($resultPath);
    TestHelper::assertContain('hogehoge', $afterString);
});
// Timing only needs to be roughly right. The one-second Console::sleep(1)
// check is left out.
TestRunner::define('Console::sleep functions', function () {
    $sleepOneMillisecond = function () {
        Console::msleep(1);
    };
    $sleepOneMicrosecond = function () {
        Console::usleep(1);
    };
    TestHelper::assertWaitingTime(0.1, $sleepOneMillisecond, 0.1);
    TestHelper::assertWaitingTime(0.001, $sleepOneMicrosecond, 0.1);
});
//==============================
// Queue
//==============================
// Queue::init() already ran when the kit was loaded; reset() must remove
// both the list file and the payload directory.
TestRunner::define('arleady Queue::init. reset', function () {
    TestHelper::assertContain('queue_list', Queue::$queueFile);
    $paths = [Queue::$queueFile, Queue::$queueDir];
    foreach ($paths as $path) {
        TestHelper::assertEqual(true, file_exists($path));
    }
    Queue::reset();
    foreach ($paths as $path) {
        TestHelper::assertEqual(false, file_exists($path));
    }
    // A second reset on an already-clean state must be harmless.
    Queue::reset();
    Queue::init();
});
// Entries come back in insertion order; popping an empty queue or using an
// uninitialized queue throws.
TestRunner::define('Queue::push, pop', function () {
    $input = 'hogehoge';
    $serialized = "s:8:\"hogehoge\";";
    $popFromQueue = function () {
        Queue::pop();
    };

    Queue::push('first', $input);
    Queue::push('second', $input);
    TestHelper::assertEqual("first\nsecond\n", file_get_contents(Queue::$queueFile));
    foreach (['first', 'second'] as $entry) {
        TestHelper::assertEqual($serialized, file_get_contents(Queue::$queueDir . '/' . $entry));
    }

    TestHelper::assertEqual($input, Queue::pop());
    TestHelper::assertEqual($input, Queue::pop());
    TestHelper::assertException($popFromQueue);

    Queue::push('dummy', $input);
    Queue::reset();
    TestHelper::assertException(function () {
        Queue::push('', []);
    });
    TestHelper::assertException($popFromQueue);

    Queue::reset();
    Queue::init();
});
// getList() reports the queued names in insertion order.
TestRunner::define('Queue::GetList', function () {
    $names = ['test_1', 'test_2', 'test_3', 'test_4', 'test_5'];
    foreach ($names as $position => $name) {
        Queue::push($name, $position + 1);
    }
    TestHelper::assertEqual(5, count(Queue::getList()));
    foreach ($names as $position => $name) {
        TestHelper::assertEqual($name, Queue::getList()[$position]);
    }
    Queue::reset();
    Queue::init();
});
// Clean-up: remove what the last Queue::init() left behind.
TestRunner::define('Remove Queue', function () {
    rmdir(Queue::$queueDir);
    unlink(Queue::$queueFile);
});
//==============================
// Cache
//==============================
// Cache::init() already ran when the kit was loaded.
TestRunner::define('arleady Cache::init', function () {
    $cacheRoot = Cache::$cacheDir;
    TestHelper::assertEqual(true, file_exists($cacheRoot));
});
// Shared save / load / delete round trip, run once per directory depth.
$assertCacheRoundTrip = function () {
    $expect = [10, 20, 'ok', true];
    Cache::save('test', $expect);
    TestHelper::assertEqual($expect, Cache::load('test'));
    Cache::delete('test');
    TestHelper::assertEqual(false, file_exists(Cache::path('test')));
};
TestRunner::define('Cache::save, load, delete', function () use ($assertCacheRoundTrip) {
    Cache::$subDirectoryDepth = 0;
    $assertCacheRoundTrip();
});
TestRunner::define('Cache::save, load, delete. depth=1', function () use ($assertCacheRoundTrip) {
    Cache::$subDirectoryDepth = 1;
    TestHelper::assertContain('/a9/a9', Cache::path('test'));
    $assertCacheRoundTrip();
});
TestRunner::define('Cache::save, load, delete. depth=2', function () use ($assertCacheRoundTrip) {
    Cache::$subDirectoryDepth = 2;
    TestHelper::assertContain('/a9/4a/a94a', Cache::path('test'));
    $assertCacheRoundTrip();
});
// Clean-up: remove the whole cache directory.
TestRunner::define('Remove Cache dir', function () {
    $command = 'rm -r ' . escapeshellarg(Cache::$cacheDir);
    @shell_exec($command);
});
//==============================
// Global functions
//==============================
TestRunner::define('array_flatten', function () {
    TestHelper::assertEqual([0, 1, 2, 3], array_flatten([[0], [[1], [[2, [3]]]]]));
});
TestRunner::define('invalid_string_replace', function () {
    // Byte escapes are used throughout so the expectation cannot be altered
    // by editors or copy/paste normalising fullwidth characters to ASCII.
    $fullwidthTilde = "\xEF\xBD\x9E";
    // U+FF5E is already the target form and must pass through unchanged.
    TestHelper::assertEqual($fullwidthTilde, invalid_string_replace($fullwidthTilde));
    // U+301C (wave dash) must actually be replaced by U+FF5E.
    TestHelper::assertEqual($fullwidthTilde, invalid_string_replace("\xE3\x80\x9C"));
});
//==============================
// Run the tests
//==============================
// TestRunner::run() returns true when any test failed or errored; that maps
// to exit status 1, otherwise 0.
exit(TestRunner::run() ? 1 : 0);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment