-
-
Save Koopzington/06a788ac95478b9d809d56aa1a85a094 to your computer and use it in GitHub Desktop.
๐๐๐๐๐๐๐๐๐๐ good shit goเฑฆิ sHit๐ thats โ some good๐๐shit right๐๐there๐๐๐ rightโthere โโif i do ฦฝaาฏ so my self ๐ฏ i say so ๐ฏ thats what im talking about right there right there (chorus: สณแถฆแตสฐแต แตสฐแตสณแต) mMMMMแทะ๐ฏ ๐๐ ๐ะO0ะเฌ OOOOOะเฌ เฌ Ooooแตแตแตแตแตแตแตแตแต๐ ๐๐ ๐ ๐ฏ ๐ ๐ ๐ ๐ ๐๐Good shit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
require 'vendor/autoload.php';
require 'proxygrabber.php';

// Location of the decompressed anime titles dump (relative to the working directory).
const XML_FILE_NAME = 'data/anime-titles.xml';
// Temporary location of the downloaded gzip archive; removed after decompression.
const ARCHIVE_NAME = 'data/anime-titles.gz';
/**
 * Decompresses a gzip archive into a plain file.
 *
 * The archive is read in 4096-byte chunks and each chunk is written to the
 * destination file, which is created or truncated.
 *
 * @param string $src archive name
 * @param string $dst destination file name
 * @throws \RuntimeException if a file cannot be opened or a chunk cannot be written
 */
function decompress($src, $dst): void
{
    $archive = gzopen($src, 'rb');
    if ($archive === false) {
        throw new \RuntimeException("Unable to open archive: {$src}");
    }

    $target = fopen($dst, 'w');
    if ($target === false) {
        gzclose($archive);
        throw new \RuntimeException("Unable to open destination file: {$dst}");
    }

    try {
        // Loop on EOF rather than on the truthiness of the chunk: a chunk that
        // is exactly "0" is falsy and would otherwise end the copy early.
        while (!gzeof($archive)) {
            $chunk = gzread($archive, 4096);
            if ($chunk === false || $chunk === '') {
                break;
            }
            if (fwrite($target, $chunk) === false) {
                throw new \RuntimeException("Unable to write to destination file: {$dst}");
            }
        }
    } finally {
        // Always release both handles, even when a write fails.
        gzclose($archive);
        fclose($target);
    }
}
/**
 * Checks whether we received an error response.
 *
 * The body is compared (ignoring surrounding whitespace) against the two
 * error payloads this script knows about.
 *
 * @param string $data XML file contents
 * @return int 1 for "Anime not found", 2 for "banned", 0 for anything else
 */
function error($data): int
{
    // Non-string input can never be one of the known error payloads; this also
    // avoids loose-comparison surprises such as 0 == 'some string' on PHP 7.
    if (!is_string($data)) {
        return 0;
    }

    // Tolerate a trailing newline or padding around the payload.
    $body = trim($data);

    if ($body === '<error>Anime not found</error>') {
        return 1;
    }

    if ($body === '<error code="500">banned</error>') {
        return 2;
    }

    return 0;
}
/**
 * Returns the list of anime IDs found in the anime-titles XML dump.
 *
 * If the local XML file is missing or at least a day old (86400 seconds), the
 * gzip archive is downloaded from AniDB through a proxy, decompressed over the
 * local file and then deleted. The IDs are taken from the "aid" attribute of
 * every <anime> element and returned in random order.
 *
 * @return array Array of anime IDs (integers); empty when no usable XML file is available
 * @throws \GuzzleHttp\Exception\GuzzleException
 */
function getAnimuList(): array
{
    $isStale = !file_exists(XML_FILE_NAME) || (time() - filemtime(XML_FILE_NAME) >= 86400);

    if ($isStale) {
        print "Downloading the titles archive\n";

        $client = new \GuzzleHttp\Client(['http_errors' => false]);
        $response = $client->request('GET', 'http://anidb.net/api/anime-titles.xml.gz', [
            'headers' => [
                'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
            ],
            'proxy' => getProxy(),
            'sink' => ARCHIVE_NAME,
        ]);

        // http_errors is disabled, so a failed download does not throw: only
        // decompress when the server actually returned the archive.
        if ($response->getStatusCode() === 200) {
            decompress(ARCHIVE_NAME, XML_FILE_NAME);
        } else {
            print "Failed to download the titles archive (HTTP {$response->getStatusCode()})\n";
        }

        if (file_exists(ARCHIVE_NAME)) {
            unlink(ARCHIVE_NAME);
        }
    }

    // A failed first download leaves nothing to parse; a failed refresh falls
    // back to the stale copy.
    if (!file_exists(XML_FILE_NAME)) {
        return [];
    }

    $xml = simplexml_load_file(XML_FILE_NAME, 'SimpleXMLElement', LIBXML_NOCDATA);
    if ($xml === false) {
        print "Unable to parse " . XML_FILE_NAME . "\n";
        return [];
    }

    // Iterate the <anime> elements directly instead of round-tripping through
    // JSON, which collapses a single <anime> element into a map instead of a list.
    $ids = [];
    foreach ($xml->anime as $anime) {
        $ids[] = (int) $anime['aid'];
    }

    // just shuffle the id array to bypass their *dank* anti-scraping *algorithm*
    shuffle($ids);

    return $ids;
}
$proxy = getProxy();

// instantiate the guzzle client with a browser-like user-agent header
$client = new \GuzzleHttp\Client([
    'debug' => true,
    'headers' => [
        'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    ],
]);

// Try to load the anime titles
$animu_list = getAnimuList();

// Request options for our query
$request_options = [
    'proxy' => $proxy,
    'http_errors' => false,
    'connect_timeout' => 30,
];

// Iterate over the IDs
foreach ($animu_list as $animu_id) {
    // Skip this iteration if file exists..
    if (file_exists("data/$animu_id.xml")) {
        continue;
    }

    $request_options['query'] = [
        'request' => 'anime',
        'client' => 'alastorehttp',
        'clientver' => 1,
        'protover' => 1,
        'aid' => $animu_id,
    ];
    // alastorehttp&clientver=1
    // goanidbhttp&clientver=1
    // Get clients name-version strings from https://wiki.anidb.net/w/UDP_Clients
    // http://api.anidb.net:9001/httpapi?request=anime&client={client}&clientver={version}&protover=1&aid=12138

    // Reset per iteration so a failed request can never reuse the body that
    // was fetched for a previous anime.
    $response = null;
    $data = null;

    // One initial attempt plus one retry with a fresh proxy.
    for ($attempt = 0; $attempt < 2; $attempt++) {
        try {
            $response = $client->request('GET', 'http://api.anidb.net:9001/httpapi', $request_options);
            $data = $response->getBody()->getContents();
            break;
        } catch (Exception $e) {
            // Proxy is probably dead
            $response = null;
            $data = null;
            $request_options['proxy'] = getProxy();
            print "Timed out or some kind of exception, getting a new proxy: {$request_options['proxy']}\n";
        }
    }

    // Both attempts failed: move on with the proxy fetched above.
    if ($response === null || $data === null) {
        sleep(2);
        continue;
    }

    $error_code = error($data);

    // If we get a bad response or we're banned we get a new proxy and skip this anime
    // since they don't let the same client request the same data-set after a ban.
    // This means this shitty script needs to be executed twice to fill the missing data
    if ($response->getStatusCode() !== 200 || $error_code === 2) {
        $request_options['proxy'] = getProxy(); // get a new proxy
        print "Banned, got a new proxy: {$request_options['proxy']}\n";
        sleep(2);
        continue;
    }

    if ($error_code === 1) {
        print "Anime not found.\n";
        sleep(2);
        continue;
    }

    // Save the XML response
    file_put_contents("data/$animu_id.xml", $data, LOCK_EX);

    // sleep between 2 and 2.5 secs - in case they monitor request intervals from an IP
    usleep(mt_rand(2000000, 2500000));
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Fetches a proxy from http://gimmeproxy.com/api/getProxy
 *
 * If the response carries an "error" field, its message is printed.
 *
 * @return string the "curl" field of the JSON response, or an empty string when it is absent
 * @throws \GuzzleHttp\Exception\GuzzleException
 */
function getProxy(): string
{
    $client = new \GuzzleHttp\Client();
    $response = $client->get('http://gimmeproxy.com/api/getProxy');
    $data = json_decode((string) $response->getBody(), true);

    // A body that is not a JSON object gives us nothing to read.
    if (!is_array($data)) {
        return '';
    }

    if (isset($data['error'])) { // there are no proxies left for this user-id and timeout
        print $data['error'] . "\n";
    }

    // Return an empty string explicitly instead of relying on false being
    // coerced to "" by the declared string return type.
    $proxy = $data['curl'] ?? '';

    return is_string($proxy) ? $proxy : '';
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment