Skip to content

Instantly share code, notes, and snippets.

@pavel-voronin
Last active August 29, 2015 13:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pavel-voronin/e1656b6a3c2cde6c4f83 to your computer and use it in GitHub Desktop.
Save pavel-voronin/e1656b6a3c2cde6c4f83 to your computer and use it in GitHub Desktop.
The-Tale.org accounts info grabber
#!/bin/sh
# Archive grabbed CSV snapshots from previous days.
#
# Works on files matching result/*.csv. For every distinct path prefix obtained
# by stripping the last 13 characters of a file name (for names of the form
# <name>-YYYY-MM-DD-HH-MM-SS.csv that is the "-HH-MM-SS.csv" tail), except
# prefixes whose last 10 characters equal today's date, the matching files are
# packed into <prefix>.tar.gz. The archive holds the first snapshot in full and
# a zero-context unified diff for every following snapshot.
#
# NOTE(review): the script relies on unquoted expansions and on parsing `ls`
# output, so it presumably expects file names without whitespace - confirm.

# rev | cut -c 14- | rev drops the last 13 characters of each path; uniq folds
# adjacent duplicates; awk keeps only prefixes that do not end in today's date.
for date_part in $(ls -1 result/*.csv | rev | cut -c 14- | rev | uniq | awk -v date=`date +"%Y-%m-%d"` 'substr($0, length($0) - 9) != date')
do
# Start from an empty scratch directory.
if [ -d result/temp ]; then
rm -rf result/temp
fi
mkdir result/temp
# Move every file belonging to this prefix into the scratch directory.
mv $date_part-* result/temp
prev=''
# Walk the snapshots in `ls` order and store the diff against the previous one
# next to each snapshot as <snapshot>.diff (the first snapshot gets no diff).
for O in $(ls -1 result/temp/*.csv)
do
if [ $prev ]; then
diff -u0 $prev $O > $O.diff
fi
prev=$O
done
# Keep only the first full snapshot; the later ones are represented by their diffs.
ls -1 result/temp/*.csv | tail -n+2 | xargs rm -f
# Pack the scratch directory contents and remove the scratch directory.
tar -zcf $date_part.tar.gz -C result/temp/ .
rm -rf result/temp
done
#!/usr/local/bin/php
<?php
// Base address that every relative url below is appended to.
$domain = 'http://the-tale.org';

// Row pattern shared by all /game/ratings/* pages. Captures, in order: the
// number in the first cell, account id, account name, the two words in front
// of the hero link, hero id, hero name and the text of the last cell.
$common_pattern = '/<tr>\s*<td>(\d+)<\/td>\s*<td><a href="\/accounts\/(\d+)">([^<]+)<\/a><\/td>\s*<td>(\w+)\s*(\w+)\s*<a href="\/game\/heroes\/(\d+)">([^<]+)<\/a><\/td>\s*<td>([^<]+)<\/td>\s*<\/tr>/su';

// Known grab targets keyed by url name. Every entry starts as
// [0 => relative url ending in "page=", 1 => row regex]; the project code in
// this file later adds 2 => working directory, 3 => page count,
// 4 => next page to grab and 5 => "compiled" flag to its own copy.
$urls = [];
foreach ([
    'help_count',
    'achievements_points',
    'referrals_number',
    'pvp_battles_1x1_victories',
    'pvp_battles_1x1_number',
    'phrases',
    'level',
    'power',
    'bills',
    'might',
] as $ratingName) {
    $urls[$ratingName] = ['/game/ratings/' . $ratingName . '?page=', $common_pattern];
}
unset($ratingName);

// The accounts list has its own table layout. Its url keeps the `&amp;` form
// because calculateSize() searches the fetched page for this exact string.
$urls['accounts'] = [
    '/accounts/?prefix=&amp;page=',
    '/<tr class="pgf-account-record">\s*<td>\s*<a href="\/accounts\/(\d+)">([^<]+)<\/a>\s*(?:<a href="\/accounts\/clans\/(\d+)">\[([^\]]+)\]<\/a>\s*)?<\/td>\s*<td>(\w+)<\/td>\s*<td class="updated-at pgf-format-date" data-timestamp="(\d+).0"><\/td>\s*<td>\s*(\w+)\s*(\w+)\s*<a href="\/game\/heroes\/(\d+)">([^<]+)<\/a>\s*<\/td>\s*<td>(\d+)<\/td>\s*<td>(\d+)<\/td>\s*<\/tr>/su',
];
/**
 * Verify that the script can run here, terminating with a message otherwise.
 *
 * Checks, in order: the SAPI is `cli`; `$argc`/`$argv` are available; the
 * current directory is writable; `result` exists (it is created when missing),
 * is a directory and is writable.
 *
 * Returns nothing; every failed check ends the script via die().
 */
function checkEnvironment()
{
    if (php_sapi_name() !== 'cli') {
        die('Run me from console, not web.' . PHP_EOL);
    }

    global $argv, $argc;
    if (!isset($argc) || !isset($argv)) {
        die('Where are you talking to me from?' . PHP_EOL);
    }

    if (!is_writable('.')) {
        die('Current directory is not writable by me.' . PHP_EOL);
    }

    if (!file_exists('result')) {
        // Creation errors are suppressed on purpose; the follow-up check reports them.
        @mkdir('result');
        if (!file_exists('result')) {
            die('I cannot create result folder. Check my rights.' . PHP_EOL);
        }
        @chmod('result', 0777);
    } elseif (!is_dir('result')) {
        die('`Result\' is not a directory. Move it out of current directory and restart me.' . PHP_EOL);
    }

    // Checked for a pre-existing directory as well, not only for a freshly
    // created one, so that a read-only `result` is reported up front.
    if (!is_writable('result')) {
        die('Result directory is not writable by me.' . PHP_EOL);
    }
}
/**
 * Load the saved project from the `.project` file in the current directory.
 *
 * `.project` is included as PHP code and is expected to return the project
 * array. NOTE(review): whatever code that file contains is executed, so it
 * must only ever be written by this script.
 *
 * @return array|null The project array, or null when no `.project` exists.
 *                    Terminates via die() when the file does not return an
 *                    array or when checkProject() rejects one of its urls.
 */
function getProject()
{
    if (!file_exists('.project')) {
        return null;
    }

    // Plain `require`: `require_once` would yield `true` on a repeated call
    // and the project would then be misreported as corrupted.
    $project = @require('.project');
    if (!is_array($project)) {
        die('Project file is corrupted. Delete it and restart me.' . PHP_EOL);
    }

    checkProject($project);
    return $project;
}
/**
 * Build an absolute url from a relative url part.
 *
 * @param string $part Relative url; it may contain HTML entities such as
 *                     `&amp;` and percent-encoded characters.
 * @return string The global `$domain` followed by the decoded part.
 */
function getUrl($part)
{
    // Without this declaration `$domain` is an undefined local variable and
    // only the path would be returned.
    global $domain;

    // Entities are decoded first so that `&amp;` becomes a real `&` separator.
    return $domain . urldecode(html_entity_decode($part));
}
/**
 * Ensure every entry of a project refers to a url known to this script.
 *
 * An entry is accepted when its first element is identical (===) to the first
 * element of some entry in the global `$urls`. The first unknown entry ends
 * the script via die().
 *
 * @param array $project Project entries; only element 0 of each is inspected.
 */
function checkProject($project)
{
    global $urls;

    // Collect the relative urls of all known targets once.
    $knownPaths = [];
    foreach ($urls as $definition) {
        $knownPaths[] = $definition[0];
    }

    foreach ($project as $entry) {
        if (!in_array($entry[0], $knownPaths, true)) {
            die('Unknown url: ' . $entry[0] . PHP_EOL);
        }
    }
}
/**
 * Print usage information and terminate the script with exit code 0.
 *
 * With no command, or with `help` and no argument, the command overview is
 * printed. With `help <topic>` the text for that topic is printed. Any other
 * command prints nothing. The function never returns.
 *
 * @param string|null $command Command word, normally null or 'help'.
 * @param string|null $arg     Topic to describe when $command is 'help'.
 */
function help($command = null, $arg = null)
{
    $self = basename(__FILE__);

    if ($command == null || ($command == 'help' && $arg == null)) {
        $overview = [
            ' help this help page',
            ' help <command> specific command help',
            ' urls show all known urls',
            ' start <all|url1 url2 ... urlN> start grabbing of all urls or one or many specific urls',
            ' batch <all|url1 url2 ... urlN> create batch project for grabbing of all urls or one or many specific urls',
            ' step continue current batch project (1 step)',
            ' finish continue and finish current batch project',
            ' clean clean current ',
        ];
        // Only the first line carries the "Usage:" lead-in.
        $lead = 'Usage: php ';
        foreach ($overview as $tail) {
            echo $lead, $self, $tail, PHP_EOL;
            $lead = ' php ';
        }
    } elseif ($command == 'help') {
        switch ($arg) {
            case 'help':
                echo 'Do you love recursions, ah? :)', PHP_EOL;
                break;

            case 'urls':
                echo 'Usage: php ', $self, ' urls', PHP_EOL;
                echo PHP_EOL;
                echo ' Show all known urls (names). You can use them with commands `start\' and `batch\'.', PHP_EOL;
                break;

            case 'start':
                echo 'Usage: php ', $self, ' start <all|url1 url2 ... urlN>', PHP_EOL;
                echo PHP_EOL;
                echo ' If `all\' keyword specified, start grabbing of all known urls immediately.', PHP_EOL;
                echo ' Otherwise, if one or some url names specified, start grabbing of this specific urls immediately.', PHP_EOL;
                break;

            case 'batch':
                echo 'Usage: php ', $self, ' batch <all|url1 url2 ... urlN>', PHP_EOL;
                echo PHP_EOL;
                echo ' If `all\' keyword specified, create new project for grabbing of all known urls immediately.', PHP_EOL;
                echo ' Otherwise, if one or some url names specified, create new project for grabbing of this specific urls immediately.', PHP_EOL;
                echo ' In all cases grabbing will not start immediately, but you should start every step with yourself with instructions shown after project creating.', PHP_EOL;
                break;

            case 'step':
                echo 'Usage: php ', $self, ' step', PHP_EOL;
                echo PHP_EOL;
                echo ' Make one step of current project made with `batch\' command. If you want to start new project, `clean\' the old one.', PHP_EOL;
                break;

            case 'finish':
                echo 'Usage: php ', $self, ' finish', PHP_EOL;
                echo PHP_EOL;
                echo ' Make all steps of current project made with `batch\' command. If you want to start new project, `clean\' the old one.', PHP_EOL;
                echo ' My default command if project was started.', PHP_EOL;
                break;

            case 'clean':
                echo 'Usage: php ', $self, ' clean', PHP_EOL;
                echo PHP_EOL;
                echo ' Delete current project files and project itself from current directory. Result files will stay.', PHP_EOL;
                break;

            default:
                echo 'I don\'t know this command. Sorry, dude.', PHP_EOL;
        }
    }

    exit(0);
}
/**
 * Print the names (keys) of all entries of the global `$urls` on one line,
 * separated by ", ".
 */
function knownUrls()
{
    global $urls;

    $names = [];
    foreach ($urls as $name => $definition) {
        $names[] = $name;
    }

    echo 'Known url names: ', implode(', ', $names), PHP_EOL;
}
/**
 * Create a project for the requested urls and run it to completion right away.
 *
 * @param string|array $urls Passed unchanged to createProject(): the string
 *                           'all' or a list of url names.
 */
function startProject($urls)
{
    print('I am starting new project for you!' . PHP_EOL);
    finishProject(createProject($urls));
}
/**
 * Run project steps until stepProject() returns a falsy value.
 *
 * stepProject() is called in "continue" mode so it returns the updated
 * project instead of exiting after each step. At least one step is always
 * attempted. Nothing is returned.
 *
 * @param array $project Project state to continue from.
 */
function finishProject($project)
{
    do {
        $project = stepProject($project, true);
    } while ($project);
}
/**
 * Determine how many pages a url has and store the count in slot 3.
 *
 * The first page is downloaded and searched for the last occurrence of the
 * relative url ($url[0]); the digits that directly follow it are taken as the
 * page count. NOTE(review): this presumably relies on the last pagination
 * link pointing at the last page - confirm against the site markup.
 *
 * @param array $url Project entry; only $url[0] (relative url ending in
 *                   "page=") is read.
 * @return array The entry with $url[3] set to the page count (at least 1).
 *               Terminates via die() when the page cannot be downloaded.
 */
function calculateSize($url)
{
    global $domain;

    // $url[0] may be HTML-escaped (`&amp;`) because it is also used as the
    // search needle below; the request itself needs the decoded form.
    $firstPage = $domain . urldecode(html_entity_decode($url[0])) . 1;
    if (($doc = file_get_contents($firstPage)) === false) {
        // Report the address that was actually requested; slot 4 (current
        // page) is not set yet at this point.
        die('I failed to grab url ' . $firstPage . ' to examine total pages count :(' . PHP_EOL);
    }

    $pages = 1;
    $linkAt = mb_strrpos($doc, $url[0]);
    // A missing link means there is no pagination: keep the default of 1
    // instead of reading digits from an unrelated offset.
    if ($linkAt !== false) {
        $tail = mb_substr($doc, $linkAt + mb_strlen($url[0]), 10);
        // Take the whole leading digit run so counts above 999 are not cut off.
        if (preg_match('/^\d+/', $tail, $found)) {
            $pages = (int)$found[0];
        }
    }

    $url[3] = $pages > 0 ? $pages : 1;
    return $url;
}
/**
 * Fill in the page count of every project entry and persist the result.
 *
 * @param array $project Project entries keyed by url name.
 * @return array The same entries, each passed through calculateSize(); the
 *               result is written with saveProject() before returning.
 */
function calculateSizes($project)
{
    $sized = [];
    foreach ($project as $name => $entry) {
        $sized[$name] = calculateSize($entry);
    }

    saveProject($sized);
    return $sized;
}
/**
 * Create a project and a `start.sh` script that runs it step by step.
 *
 * The project is created, the page count of every url is determined, and
 * `start.sh` is written with one `php <script> step` line per required step.
 * Nothing is grabbed here.
 *
 * @param string|array $urls Passed unchanged to createProject(): the string
 *                           'all' or a list of url names.
 */
function batchProject($urls)
{
    echo 'I am batching new project for you!' . PHP_EOL;

    $project = calculateSizes(createProject($urls));
    foreach ($project as $urlName => $url) {
        if (!isset($url[3])) {
            die('I don\'t know, why, but i don\'t know, how many pages are in url ' . $urlName . '.' . PHP_EOL);
        }
    }

    if (($shFile = fopen('start.sh', 'w')) === false) {
        die('I cannot create start.sh. Do I have enough rights?' . PHP_EOL);
    }

    $stepLine = 'php ' . basename(__FILE__) . ' step' . PHP_EOL;
    foreach ($project as $url) {
        // One step per page plus one step that compiles the pages into a csv.
        fwrite($shFile, str_repeat($stepLine, $url[3] + 1));
    }
    // A single closing step cleans the whole project. Writing it once, not
    // once per url, avoids surplus steps that would fail with "No one project
    // was started" after the cleanup.
    fwrite($shFile, $stepLine);
    fclose($shFile);

    echo 'Done! Now you can run `sh start.sh\' to process all the project OR you can use me that way: `php ' . basename(__FILE__) . ' step\' to get things done step-by-step.' . PHP_EOL;
}
/**
 * Perform the next pending step of a project.
 *
 * Entry slots used: 0 relative url, 1 row regex, 2 working directory,
 * 3 page count, 4 next page to grab, 5 "compiled" flag. For the first entry
 * without slot 5 exactly one of these happens:
 *   - the page count is determined (when slot 3 is missing);
 *   - all grabbed pages are merged into result/<name>-<timestamp>.csv and the
 *     entry is flagged as compiled (when every page has been grabbed);
 *   - the next page is downloaded, matched against the row regex and stored
 *     as a csv piece named after the page number in the working directory.
 * The working directory and the page counter are initialised on the way
 * without ending the step. When every entry is compiled, the project is
 * cleaned with cleanProject().
 *
 * Progress is persisted with saveProject() after every change.
 *
 * @param array $project  Project entries keyed by url name.
 * @param bool  $continue When true the updated project is returned after the
 *                        step (false once the project has been cleaned); when
 *                        false the script exits with code 0 after the step.
 * @return array|false
 */
function stepProject($project, $continue = false)
{
    global $domain;

    foreach ($project as $urlName => $url) {
        if (isset($url[5])) {
            continue;
        }

        if (!isset($url[2])) {
            // The working directory is named after the url and lives in the current directory.
            $url[2] = $urlName;
            @mkdir($url[2]);
            @chmod($url[2], 0777);
            if (!file_exists($url[2])) {
                die('I am unable to create a project directory :(' . PHP_EOL);
            }
            if (!is_writable($url[2])) {
                @rmdir($url[2]);
                die('Created project folder is not writable. I\'m sorry.' . PHP_EOL);
            }
            echo 'I have created directory ' . $url[2] . ' for url ' . $urlName . '.' . PHP_EOL;
            $project[$urlName] = $url;
            saveProject($project);
        }

        if (!isset($url[3])) {
            $url = calculateSize($url);
            $project[$urlName] = $url;
            saveProject($project);
            echo 'Url ' . $urlName . ' have ' . $url[3] . ' pages.' . PHP_EOL;
            if ($continue) {
                return $project;
            }
            exit(0);
        }

        if (!isset($url[4])) {
            $url[4] = 1;
            $project[$urlName] = $url;
            saveProject($project);
        }

        if ($url[3] < $url[4]) {
            // Every page has been grabbed: merge the pieces into one csv.
            $csvName = 'result/' . $urlName . '-' . date('Y-m-d-H-i-s') . '.csv';
            echo 'I have fully grabbed url ' . $urlName . ' and now I am trying to compile it.' . PHP_EOL;
            if (($csv = fopen($csvName, 'w')) === false) {
                die('I cannot create csv file. Do I have enough rights?' . PHP_EOL);
            }

            $listing = scandir($url[2]);
            if ($listing === false) {
                die('I cannot read project directory ' . $url[2] . '. Do I have enough rights?' . PHP_EOL);
            }
            $pieces = [];
            foreach ($listing as $file) {
                if ($file == '.' || $file == '..') {
                    continue;
                }
                $pieces[] = $file;
            }
            // Pieces are named after page numbers; natural ordering merges
            // them in page order (1, 2, ..., 10) so the compiled csv is
            // deterministic from run to run.
            natsort($pieces);
            foreach ($pieces as $file) {
                fwrite($csv, file_get_contents($url[2] . '/' . $file));
            }
            fclose($csv);

            $url[5] = true;
            $project[$urlName] = $url;
            saveProject($project);
            echo 'Done! I have compiled data from url ' . $urlName . ' into ' . $csvName . PHP_EOL;
            if ($continue) {
                return $project;
            }
            exit(0);
        }

        echo 'I am starting a new step of grabbing url ' . $urlName . ' (' . $url[4] . '/' . $url[3] . ')' . PHP_EOL;
        // $url[0] may be HTML-escaped (`&amp;`); the request needs the decoded form.
        $pageUrl = $domain . urldecode(html_entity_decode($url[0])) . $url[4];
        if (($doc = file_get_contents($pageUrl)) === false) {
            die('I failed to grab url ' . $pageUrl . ' :( Try `step\' later.' . PHP_EOL);
        }

        preg_match_all($url[1], $doc, $data, PREG_SET_ORDER);
        // Drop the full-match element so only captured groups reach the csv.
        foreach ($data as $id => $one) {
            unset($data[$id][0]);
        }

        if (($csv = fopen($url[2] . '/' . $url[4], 'w')) === false) {
            die('I cannot create piece of data csv file. Do I have enough rights?' . PHP_EOL);
        }
        foreach ($data as $o) {
            fputcsv($csv, $o);
        }
        fclose($csv);

        $url[4]++;
        $project[$urlName] = $url;
        saveProject($project);
        echo 'Done!' . PHP_EOL;
        if ($continue) {
            return $project;
        }
        exit(0);
    }

    // Every entry is compiled: remove the project and its working files.
    echo 'Wow! I have finished my work and now i\' m gonna clean this project.' . PHP_EOL;
    cleanProject($project);
    echo 'Done! Have a nice day!' . PHP_EOL;
    if ($continue) {
        return false;
    }
    exit(0);
}
/**
 * Persist the project to `.project` in the current directory.
 *
 * The file is PHP source that returns the project array (var_export output).
 * Terminates via die() when the file cannot be written, so that progress is
 * never lost silently.
 *
 * @param array $project Project entries to store.
 */
function saveProject($project)
{
    $code = '<?php' . PHP_EOL . 'return ' . var_export($project, true) . ';';
    if (file_put_contents('.project', $code) === false) {
        die('I cannot save project file. Do I have enough rights?' . PHP_EOL);
    }
}
/**
 * Build a new project from the global `$urls`, save it and announce it.
 *
 * @param string|array $need_urls The string 'all' to take every known url, or
 *                                a list of url names. An unknown name ends
 *                                the script via die().
 * @return array The new project, keyed by url name.
 */
function createProject($need_urls)
{
    global $urls;

    if ($need_urls === 'all') {
        $selected = $urls;
    } else {
        $selected = [];
        foreach ($need_urls as $name) {
            if (!isset($urls[$name])) {
                die('Unknown url name! Please, use only names from `urls\' command.' . PHP_EOL);
            }
            $selected[$name] = $urls[$name];
        }
    }

    saveProject($selected);
    echo 'I have created project for urls: ', implode(', ', array_keys($selected)), PHP_EOL;

    return $selected;
}
/**
 * Remove the project file, `start.sh` and every working directory of a project.
 *
 * Deletions are attempted with warnings suppressed and verified afterwards;
 * anything that still exists ends the script via die().
 *
 * @param array $project Project entries; slot 2 of an entry, when set, names
 *                       its working directory.
 */
function cleanProject($project)
{
    @unlink('.project');
    if (file_exists('.project')) {
        die('I failed to delete project! What is wrong with my rights?' . PHP_EOL);
    }

    if (file_exists('start.sh')) {
        @unlink('start.sh');
        if (file_exists('start.sh')) {
            die('I failed to delete start.sh! What is wrong with my rights?' . PHP_EOL);
        }
    }

    foreach ($project as $entry) {
        // Entries that never got a working directory have nothing to remove.
        if (!isset($entry[2])) {
            continue;
        }
        $dir = $entry[2];

        foreach (scandir($dir) as $item) {
            if ($item != '.' && $item != '..') {
                $target = $dir . '/' . $item;
                @unlink($target);
                if (file_exists($target)) {
                    die('I failed to delete project files! What is wrong with my rights?' . PHP_EOL);
                }
            }
        }

        @rmdir($dir);
        if (file_exists($dir)) {
            die('I failed to delete project files! What is wrong with my rights?' . PHP_EOL);
        }
    }
}
/**
 * Dispatch on the command line arguments.
 *
 * Reads the globals `$argc`, `$argv` and `$project` (the loaded project, or
 * null). Without a command it shows help when there is no project and
 * otherwise finishes the project. Known commands: help, urls, start, batch,
 * step, finish, clean. An unknown command ends the script with a message
 * instead of being silently ignored.
 */
function route()
{
    global $argc, $argv, $project;

    if ($argc == 1) {
        if ($project === null) {
            help();
        } else {
            $project = finishProject($project);
        }
        return;
    }

    switch ($argv[1]) {
        case 'help':
            if (isset($argv[2])) {
                help($argv[1], $argv[2]);
            } else {
                help($argv[1]);
            }
            break;

        case 'urls':
            knownUrls();
            break;

        case 'start':
        case 'batch':
            if ($project !== null) {
                die('I can\'t because of existing project. `clean\' it first.' . PHP_EOL);
            }
            if ($argc < 3) {
                die('What urls should I grab, master? Maybe, all of them?' . PHP_EOL);
            }
            // Only the arguments after the command are url names; the script
            // name and the command word must not be able to trigger `all`.
            $names = array_slice($argv, 2);
            $wanted = in_array('all', $names, true) ? 'all' : $names;
            if ($argv[1] == 'start') {
                startProject($wanted);
            } else {
                batchProject($wanted);
            }
            break;

        case 'step':
            if ($project === null) {
                die('No one project was started. First run `batch\' command.' . PHP_EOL);
            }
            $project = stepProject($project);
            break;

        case 'finish':
            if ($project === null) {
                die('No one project was started. First run `batch\' command.' . PHP_EOL);
            }
            $project = finishProject($project);
            break;

        case 'clean':
            if ($project !== null) {
                cleanProject($project);
            }
            exit(0);

        default:
            die('I don\'t know this command. Sorry, dude.' . PHP_EOL);
    }
}
// Entry point: validate the runtime environment, load a saved project if one
// exists, then dispatch on the command line arguments.
checkEnvironment();
// Read by route() through `global`; null when there is no `.project` file.
$project = getProject();
route();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment