Last active
December 20, 2023 07:34
-
-
Save divinity76/79efd7b8c0d7849b956cd194659c98e5 to your computer and use it in GitHub Desktop.
misc PHP copypasta
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// find . -iname "*.php" -print0 | xargs -0 --max-args=1 --max-procs=$(nproc) '-I{}' sh -c 'php --syntax-check {} || true' | grep --invert-match "^No syntax errors detected in" | |
/**
 * json_encode() with the most human-friendly flag set enabled by default.
 *
 * @param mixed $data
 *            value to encode
 * @param int $extra_flags
 *            additional JSON_* flags to switch on
 * @param int $exclude_flags
 *            JSON_* flags to switch off (applied last, so it wins over the defaults and $extra_flags)
 * @return string
 */
function json_encode_pretty($data, int $extra_flags = 0, int $exclude_flags = 0): string
{
    // prettiest flags for: 7.3.9
    $defaults = [
        JSON_PRETTY_PRINT,
        JSON_UNESCAPED_SLASHES,
        JSON_UNESCAPED_UNICODE,
        JSON_PRESERVE_ZERO_FRACTION,
    ];
    // these two constants do not exist on every PHP version, so probe for them at runtime
    foreach (["JSON_UNESCAPED_LINE_TERMINATORS", "JSON_THROW_ON_ERROR"] as $optional_constant) {
        if (defined($optional_constant)) {
            $defaults[] = constant($optional_constant);
        }
    }
    $mask = $extra_flags;
    foreach ($defaults as $bit) {
        $mask |= $bit;
    }
    $mask &= ~$exclude_flags;
    return json_encode($data, $mask);
}
/**
 * Build a shell command (echo -ne '...') that prints the given, possibly binary, data to stdout.
 *
 * Alphanumerics and a whitelist of printable ASCII punctuation are emitted verbatim,
 * backslash / single quote / CR / LF get dedicated escapes, and every other byte is
 * written as a \xHH escape. The very first byte is only emitted verbatim when it is
 * alphanumeric; punctuation in first position is hex-escaped.
 *
 * @param string $binary
 *            the (optionally binary) data to echo
 * @param int $max_ish_line_length
 *            the approximate maximum line length; the wrap check runs before each byte is
 *            appended, so a multi-character escape may push a line slightly past it
 * @return string the generated command
 */
function generateBinaryEcho(string $binary, int $max_ish_line_length = 50): string
{
    $command_prefix = "echo -ne '";
    // leave room for the closing quote + line-continuation backslash
    $wrap_at = $max_ish_line_length - strlen("'\\");
    // http://www.asciitable.com/
    $printable_specials = " !\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
    $fixed_escapes = [
        "\\" => "\\\\",
        '\'' => '\'\\\'\'',
        "\n" => "\\n",
        "\r" => "\\r",
    ];
    $payload = "";
    $column = strlen($command_prefix);
    $byte_count = strlen($binary);
    $pos = 0;
    while ($pos < $byte_count) {
        if ($column >= $wrap_at) {
            // close the quote, continue the command on the next line, reopen the quote
            $payload .= "'\\\n'";
            $column = strlen("'");
        }
        $byte = $binary[$pos];
        if (isset($fixed_escapes[$byte])) {
            $piece = $fixed_escapes[$byte];
        } elseif (ctype_alnum($byte) || ($pos !== 0 && strpos($printable_specials, $byte) !== false)) {
            // safe to emit as-is
            $piece = $byte;
        } else {
            // some binary-ish or unicode-ish data, hex-escape it..
            $piece = '\\x' . bin2hex($byte);
        }
        $payload .= $piece;
        $column += strlen($piece);
        ++$pos;
    }
    return $command_prefix . $payload . "'";
}
/**
 * better version of shell_exec(),
 * supporting both stdin and stdout and stderr and os-level return code
 *
 * stdout and stderr of the child are redirected into temporary files (not pipes),
 * so a chatty child can never block on a full pipe buffer while we are waiting for it.
 *
 * @param string $cmd
 *            command to execute
 * @param string $stdin
 *            (optional) data to send to stdin, binary data is supported.
 * @param string|null $stdout
 *            (optional, output) stdout data generated by cmd
 * @param string|null $stderr
 *            (optional, output) stderr data generated by cmd
 * @param bool $print_std
 *            (optional, default false) if you want stdout+stderr to be printed while it's running,
 *            set this to true. (useful for long-running commands)
 * @throws \RuntimeException if the temporary files or the process could not be created
 * @return int exit code of the command
 */
function hhb_exec(string $cmd, string $stdin = "", ?string &$stdout = null, ?string &$stderr = null, bool $print_std = false): int
{
    $stdouth = tmpfile();
    $stderrh = tmpfile();
    if ($stdouth === false || $stderrh === false) {
        if (is_resource($stdouth)) {
            fclose($stdouth);
        }
        if (is_resource($stderrh)) {
            fclose($stderrh);
        }
        throw new \RuntimeException("tmpfile() failed, unable to capture output of command: {$cmd}");
    }
    $descriptorspec = array(
        0 => array(
            "pipe",
            "rb"
        ), // stdin
        1 => array(
            "file",
            stream_get_meta_data($stdouth)['uri'],
            'ab'
        ),
        2 => array(
            "file",
            stream_get_meta_data($stderrh)['uri'],
            'ab'
        )
    );
    $pipes = array();
    $proc = proc_open($cmd, $descriptorspec, $pipes);
    if ($proc === false) {
        // without this check the code below would operate on a non-existing stdin pipe
        fclose($stdouth);
        fclose($stderrh);
        throw new \RuntimeException("proc_open() failed for command: {$cmd}");
    }
    while (strlen($stdin) > 0) {
        $written_now = fwrite($pipes[0], $stdin);
        if ($written_now === false || $written_now < 1 || $written_now === strlen($stdin)) {
            // either everything is written, or the child stopped accepting input
            break;
        }
        $stdin = substr($stdin, $written_now);
    }
    fclose($pipes[0]);
    unset($stdin, $pipes[0]);
    if (! $print_std) {
        $proc_ret = proc_close($proc); // this line will stall until the process has exited.
        $stdout = stream_get_contents($stdouth);
        $stderr = stream_get_contents($stderrh);
    } else {
        $stdout = "";
        $stderr = "";
        stream_set_blocking($stdouth, false);
        stream_set_blocking($stderrh, false);
        // drains whatever the child has appended to the temp files since the last call,
        // mirrors it to our own STDOUT/STDERR, and reports whether anything new arrived
        $fetchstd = function () use (&$stdout, &$stderr, &$stdouth, &$stderrh): bool {
            $ret = false;
            $tmp = stream_get_contents($stdouth);
            if (is_string($tmp) && strlen($tmp) > 0) {
                $ret = true;
                $stdout .= $tmp;
                fwrite(STDOUT, $tmp);
            }
            $tmp = stream_get_contents($stderrh);
            if (is_string($tmp) && strlen($tmp) > 0) {
                $ret = true;
                $stderr .= $tmp;
                fwrite(STDERR, $tmp);
            }
            return $ret;
        };
        while (($status = proc_get_status($proc))["running"]) {
            if (! $fetchstd()) {
                // nothing new, poll again in 100 ms
                usleep(100 * 1000);
            }
        }
        // the exit code must be taken from the first non-running status;
        // proc_close() can no longer report it once proc_get_status() has seen the exit
        $proc_ret = $status["exitcode"];
        proc_close($proc);
        // pick up whatever was written between the last poll and the exit
        $fetchstd();
    }
    fclose($stdouth);
    fclose($stderrh);
    return $proc_ret;
}
/**
 * Run several shell commands in parallel and collect their output.
 *
 * Each command gets its own temporary files for stdout/stderr and a stdin that is
 * closed immediately. At most $max_concurrent_workers commands run at the same time.
 *
 * @param array $cmds
 *            list of shell command strings
 * @param int $max_concurrent_workers
 *            maximum number of simultaneously running commands, must be >= 1
 * @param float $sleep_interval_seconds
 *            how long to sleep between polls while every running command is still busy
 * @throws \InvalidArgumentException if $max_concurrent_workers is < 1
 * @throws \RuntimeException if a command could not be started
 * @return array list (in order of completion, not submission) of
 *         ["cmd" => string, "exitcode" => int, "stdout" => string, "stderr" => string]
 */
function hhb_exec_parallel(array $cmds, int $max_concurrent_workers = 128, float $sleep_interval_seconds = 0.1): array
{
    if ($max_concurrent_workers < 1) {
        // with 0 slots the scheduling loop below could never make progress
        throw new \InvalidArgumentException("max_concurrent_workers must be >= 1, got {$max_concurrent_workers}");
    }
    $ret = [];
    $workers = [];
    // blocks until at least 1 running worker has finished, harvests all finished workers
    // into $ret, and returns how many were harvested (0 only if there were no workers)
    $work = function () use (&$workers, &$ret, $sleep_interval_seconds): int {
        $closed_workers = 0;
        if (count($workers) < 1) {
            return $closed_workers;
        }
        for (;;) {
            foreach (array_keys($workers) as $worker_key) {
                $worker = $workers[$worker_key];
                $status = proc_get_status($worker["handle"]);
                if ($status["running"]) {
                    continue;
                }
                // the exit code is only reliable on the first status call after the exit,
                // proc_close() cannot report it any more at this point
                $worker["exitcode"] = $status["exitcode"];
                proc_close($worker["handle"]);
                ++ $closed_workers;
                $worker["stdout"] = stream_get_contents($worker["stdout_handle"]);
                fclose($worker["stdout_handle"]);
                $worker["stderr"] = stream_get_contents($worker["stderr_handle"]);
                fclose($worker["stderr_handle"]);
                unset($worker["handle"], $worker["stdout_handle"], $worker["stderr_handle"]);
                unset($workers[$worker_key]);
                $ret[] = $worker;
            }
            if ($closed_workers > 0) {
                return $closed_workers;
            }
            // all workers are still busy.
            usleep((int) ($sleep_interval_seconds * 1000000));
        }
    };
    foreach ($cmds as $cmd) {
        while (count($workers) >= $max_concurrent_workers) {
            $work();
        }
        $curr = [
            "cmd" => $cmd,
            "stdout_handle" => tmpfile(),
            "stderr_handle" => tmpfile()
        ];
        $descriptorspec = array(
            // if we don't create a stdin, the child will *inherit* ours, we don't want that to happen,
            // so we create a stdin just to close it asap.
            0 => array(
                "pipe",
                "rb"
            ),
            // stdout is a temporary file the child writes to
            1 => array(
                "file",
                stream_get_meta_data($curr["stdout_handle"])["uri"],
                "wb"
            ),
            // stderr is a temporary file the child writes to
            2 => array(
                "file",
                stream_get_meta_data($curr["stderr_handle"])["uri"],
                "wb"
            )
        );
        $pipes = [];
        $curr["handle"] = proc_open($cmd, $descriptorspec, $pipes);
        if ($curr["handle"] === false) {
            fclose($curr["stdout_handle"]);
            fclose($curr["stderr_handle"]);
            throw new \RuntimeException("proc_open() failed for command: {$cmd}");
        }
        fclose($pipes[0]);
        unset($pipes);
        $workers[] = $curr;
    }
    while (count($workers) > 0) {
        $work();
    }
    return $ret;
}
/**
 * Validate a username chosen by a new user.
 *
 * Rules: no leading/trailing whitespace, valid UTF-8, no consecutive spaces,
 * only alphanumerics / space / dash / underscore, and a length (in characters)
 * between $min_len and $max_len inclusive.
 *
 * @param string $username
 *            the candidate username
 * @param string|null $error
 *            (output) human-readable reason for rejection, or an empty string on success
 * @param int $min_len
 *            minimum length in characters
 * @param int $max_len
 *            maximum length in characters
 * @return bool true if the username is acceptable
 */
function validate_new_username(string $username, ?string &$error = null, int $min_len = 3, int $max_len = 20): bool
{
    if ($username !== ltrim($username)) {
        $error = "starts with space(s)!";
        return false;
    }
    if ($username !== rtrim($username)) {
        $error = "ends with space(s)!";
        return false;
    }
    // must run before the /u regexes below, they fail on invalid UTF-8
    if (! mb_check_encoding($username, 'UTF-8')) {
        $error = 'not valid UTF8!';
        return false;
    }
    if (preg_match('/[\ ]{2,}/u', $username)) {
        $error = 'contains repeating spaces!';
        return false;
    }
    if (! preg_match('/^[[:alnum:]\ \-\_]+$/u', $username)) {
        $error = 'contains invalid characters!';
        return false;
    }
    $mblen = mb_strlen($username, 'UTF-8');
    if ($mblen < $min_len) {
        // double quotes are required here, single-quoted strings do not interpolate variables
        $error = "username too short, must be at least {$min_len} character(s).";
        return false;
    }
    if ($mblen > $max_len) {
        $error = "username too long, can be no longer than {$max_len} character(s).";
        return false;
    }
    //todo: check for duplicate username
    $error = '';
    return true;
}
/**
 * Generate a random password using random_int().
 *
 * Characters that are easily confused in some fonts (1 I l o O 0) are never part of the
 * built-in alphabets; anything passed in $additionalCharacters is used as given.
 *
 * @param int $length
 *            number of characters to generate, must be >= 0
 * @param bool $lowercase
 *            include a-z (minus l and o)
 * @param bool $uppercase
 *            include A-Z (minus I and O)
 * @param bool $numbers
 *            include 2-9
 * @param string $additionalCharacters
 *            extra single-byte characters to draw from
 * @throws \InvalidArgumentException on negative length or an empty alphabet
 * @return string
 */
function generatePassword(int $length = 14, bool $lowercase = true, bool $uppercase = true, bool $numbers = true, string $additionalCharacters = ""): string
{
    if ($length < 0) {
        throw new \InvalidArgumentException("length must be >=0");
    }
    // alphabet fragments in the order they are appended; each is paired with its on/off switch
    $fragments = [
        [$lowercase, "abcdefghijkmnpqrstuvwxyz"], // omitted lo
        [$uppercase, "ABCDEFGHJKLMNPQRSTUVWXYZ"], // omitted IO
        [$numbers, "23456789"], // omitted 01
        [strlen($additionalCharacters) > 0, $additionalCharacters],
    ];
    $alphabet = "";
    foreach ($fragments as $fragment) {
        if ($fragment[0]) {
            $alphabet .= $fragment[1];
        }
    }
    if (strlen($alphabet) < 1) {
        throw new \InvalidArgumentException("at least one of lowercase, uppercase, numbers, or addictionalCharacters must be supplied");
    }
    $highest_index = strlen($alphabet) - 1;
    $password = '';
    $remaining = $length;
    while ($remaining > 0) {
        $password .= $alphabet[random_int(0, $highest_index)];
        -- $remaining;
    }
    return $password;
}
/**
 * Return the MIME string for this paste, memoised in $this->mime_string.
 *
 * NOTE: real detection is currently switched off by the unconditional branch below,
 * so every paste is reported as "text/plain; charset=UTF-8". The detection code that
 * follows (shelling out to `file`) is kept for reference but is not reached.
 *
 * @throws \LogicException (only from the disabled detection code)
 * @return string
 */
public function guess_mime(): string
{
    if (null !== $this->mime_string) {
        // already determined by an earlier call
        return $this->mime_string;
    }
    if (true) {
        // fsck it..
        return $this->mime_string = "text/plain; charset=UTF-8";
    }
    $file_cmd = [
        'file',
        '--brief',
        '--mime-type',
        '--mime-encoding',
    ];
    if ($this->paste_type === $this::PASTE_TYPE_FILE) {
        $file_cmd[] = escapeshellarg($this->upload_uri);
        $cmd = implode(" ", $file_cmd);
        $this->mime_string = shell_exec($cmd);
        return $this->mime_string;
    }
    if ($this->paste_type !== $this::PASTE_TYPE_STRING) {
        throw new \LogicException("unknown paste type!");
    }
    // the content only exists in memory, so it has to be fed to `file` through stdin,
    // which needs proc_open instead of the much simpler shell_exec
    $file_cmd[] = '-';
    $cmd = implode(" ", $file_cmd);
    $descriptorspec = [
        0 => ["pipe", "rb"],
        1 => ["pipe", "wb"],
        2 => ["pipe", "wb"],
    ];
    $pipes = [];
    $proc = proc_open($cmd, $descriptorspec, $pipes);
    try {
        fwrite_all($pipes[0], $this->content);
    } catch (\Throwable $ex) {
        // we couldn't write all, but we don't care.
        unset($ex);
    }
    fclose($pipes[0]);
    unset($pipes[0]);
    stream_set_blocking($pipes[1], true);
    $stdout = "";
    for (;;) {
        $status = proc_get_status($proc);
        if (! $status["running"]) {
            break;
        }
        $chunk = stream_get_contents($pipes[1]);
        if (is_string($chunk)) {
            $stdout .= $chunk;
        }
    }
    unset($status);
    // one last read for anything written right before the process exited
    $chunk = stream_get_contents($pipes[1]);
    if (is_string($chunk)) {
        $stdout .= $chunk;
    }
    fclose($pipes[1]);
    $stderr = stream_get_contents($pipes[2]);
    fclose($pipes[2]);
    unset($pipes[1], $pipes[2], $pipes);
    if (! empty($stderr)) {
        throw new \LogicException("file wrote to stderr, wtf? cmd: {$cmd} stderr: {$stderr} ");
    }
    proc_close($proc);
    $this->mime_string = $stdout;
    return $this->mime_string;
}
} | |
/**
 * Load HTML as UTF-8 into a DOMDocument and drop every whitespace-only text node.
 *
 * DOMDocument normally keeps a text node for the blanks between tags, e.g.
 * <head> <title></title></head> yields an extra node for the space between <head> and <title>.
 * This function removes those, so head->firstChild is the <title> element directly
 * (instead of head->firstChild->nextSibling), which is closer to what browsers expose.
 *
 * @param string $html
 * @param int $extra_flags extra libxml flags for DOMDocument::loadHTML
 * @param int $exclude_flags default flags to remove before $extra_flags are added
 * @return \DOMDocument
 */
function loadHTML_noemptywhitespace(string $html, int $extra_flags = 0, int $exclude_flags = 0): \DOMDocument
{
    $default_flags = LIBXML_HTML_NODEFDTD | LIBXML_NOBLANKS | LIBXML_NONET;
    $libxml_flags = ($default_flags & ~ $exclude_flags) | $extra_flags;
    $document = new \DOMDocument();
    $document->preserveWhiteSpace = false;
    // the xml-encoding prefix makes libxml treat the markup as UTF-8
    @$document->loadHTML('<?xml encoding="UTF-8">' . $html, $libxml_flags);
    $prune = function (\DOMNode $current) use (&$prune): void {
        if ($current->hasChildNodes()) {
            // Warning: walk the children back-to-front; removing a node while walking forwards
            // would shift the indexes of the live DOMNodeList (which is also why foreach is not used)
            $index = $current->childNodes->length;
            while (-- $index >= 0) {
                $prune($current->childNodes->item($index));
            }
        }
        $is_blank_text = $current->nodeType === XML_TEXT_NODE
            && ! $current->hasChildNodes()
            && ! $current->hasAttributes()
            && strlen(trim($current->textContent)) === 0;
        if ($is_blank_text) {
            $current->parentNode->removeChild($current);
        }
    };
    $prune($document);
    return $document;
}
/**
 * fetch all urls in parallel,
 * warning: all urls must be unique.. (responses are keyed by url, duplicates overwrite each other)
 *
 * @param array $urls_unique
 *            urls to fetch
 * @param int $max_connections
 *            (optional, default 100) max simultaneous connections, must be >= 1 when there are urls to fetch
 *            (some websites will auto-ban you for "ddosing" if you send too many requests simultaneously,
 *            and some wifi routers will get unstable on too many connections.. )
 * @param array|null $additional_curlopts
 *            (optional) set additional curl options here, each curl handle will get these options
 * @throws \InvalidArgumentException if $max_connections is < 1 while there are urls to fetch
 * @throws \RuntimeException on curl_multi errors
 * @throws \RuntimeException on curl_init() / curl_setopt() errors
 * @return array(url=>response,url2=>response2,...)
 */
function curl_fetch_multi_2(array $urls_unique, int $max_connections = 100, ?array $additional_curlopts = null)
{
    if ($max_connections < 1 && count($urls_unique) > 0) {
        // with 0 handles the scheduling loop below could never make progress
        throw new \InvalidArgumentException("max_connections must be >= 1, got {$max_connections}");
    }
    // stable integer key for a curl handle: handles are resources on PHP 7 (int-castable)
    // but objects on PHP 8+, where an (int) cast yields 1 for every handle
    $handle_id = static function ($ch): int {
        return is_object($ch) ? spl_object_id($ch) : (int) $ch;
    };
    $ret = array();
    $mh = curl_multi_init();
    // $workers format: [handle id]=url
    $workers = array();
    $max_connections = min($max_connections, count($urls_unique));
    $unemployed_workers = array();
    for ($i = 0; $i < $max_connections; ++ $i) {
        $unemployed_worker = curl_init();
        if (! $unemployed_worker) {
            throw new \RuntimeException("failed creating unemployed worker #" . $i);
        }
        $unemployed_workers[] = $unemployed_worker;
    }
    unset($i, $unemployed_worker);
    // blocks until at least 1 transfer has finished, stores the finished responses in $ret
    // and moves the finished handles back to the unemployed pool
    $work = function () use (&$workers, &$unemployed_workers, $mh, &$ret, $handle_id): void {
        assert(count($workers) > 0, "work() called with 0 workers!!");
        $still_running = null;
        for (;;) {
            do {
                $err = curl_multi_exec($mh, $still_running);
            } while ($err === CURLM_CALL_MULTI_PERFORM);
            if ($err !== CURLM_OK) {
                $errinfo = [
                    "multi_exec_return" => $err,
                    "curl_multi_errno" => curl_multi_errno($mh),
                    "curl_multi_strerror" => curl_multi_strerror($err)
                ];
                $errstr = "curl_multi_exec error: " . str_replace([
                    "\r",
                    "\n"
                ], "", var_export($errinfo, true));
                throw new \RuntimeException($errstr);
            }
            if ($still_running < count($workers)) {
                // some workers has finished downloading, process them
                break;
            } else {
                // no workers finished yet, sleep-wait for workers to finish downloading.
                curl_multi_select($mh, 1);
            }
        }
        while (false !== ($info = curl_multi_info_read($mh))) {
            if ($info['msg'] !== CURLMSG_DONE) {
                // no idea what this is, it's not the message we're looking for though, ignore it.
                continue;
            }
            // 'result' is a per-transfer CURLE_* code (not a CURLM_* multi code)
            if ($info['result'] !== CURLE_OK) {
                $errinfo = [
                    "effective_url" => curl_getinfo($info['handle'], CURLINFO_EFFECTIVE_URL),
                    "curl_errno" => curl_errno($info['handle']),
                    "curl_error" => curl_error($info['handle']),
                    "curl_multi_errno" => curl_multi_errno($mh),
                    "curl_multi_strerror" => curl_multi_strerror(curl_multi_errno($mh))
                ];
                $errstr = "curl_multi worker error: " . str_replace([
                    "\r",
                    "\n"
                ], "", var_export($errinfo, true));
                throw new \RuntimeException($errstr);
            }
            $ch = $info['handle'];
            $ch_index = $handle_id($ch);
            $url = $workers[$ch_index];
            $ret[$url] = curl_multi_getcontent($ch);
            unset($workers[$ch_index]);
            curl_multi_remove_handle($mh, $ch);
            $unemployed_workers[] = $ch;
        }
    };
    $opts = array(
        CURLOPT_URL => '',
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_ENCODING => ''
    );
    if (! empty($additional_curlopts)) {
        // i would have used array_merge(), but it does scary stuff with integer keys.. foreach() is easier to reason about
        foreach ($additional_curlopts as $key => $val) {
            $opts[$key] = $val;
        }
    }
    foreach ($urls_unique as $url) {
        while (empty($unemployed_workers)) {
            $work();
        }
        $new_worker = array_pop($unemployed_workers);
        $opts[CURLOPT_URL] = $url;
        if (! curl_setopt_array($new_worker, $opts)) {
            $errstr = "curl_setopt_array failed: " . curl_errno($new_worker) . ": " . curl_error($new_worker) . " " . var_export($opts, true);
            throw new \RuntimeException($errstr);
        }
        $workers[$handle_id($new_worker)] = $url;
        curl_multi_add_handle($mh, $new_worker);
    }
    while (count($workers) > 0) {
        $work();
    }
    foreach ($unemployed_workers as $unemployed_worker) {
        curl_close($unemployed_worker);
    }
    curl_multi_close($mh);
    return $ret;
}
/**
 * Convert a byte string to its textual bit representation,
 * 8 characters ("0"/"1", most significant bit first) per input byte.
 *
 * @param string $str
 * @return string
 */
function strtobits(string $str): string
{
    $bits = "";
    $byte_count = strlen($str);
    for ($pos = 0; $pos < $byte_count; ++ $pos) {
        // decbin() drops leading zeroes, so pad each byte back to 8 digits
        $bits .= str_pad(decbin(ord($str[$pos])), 8, "0", STR_PAD_LEFT);
    }
    return $bits;
}
/**
 * Convert a textual bit string ("0"/"1" characters, most significant bit first)
 * back to a byte string; the inverse of an 8-bits-per-byte dump.
 *
 * If the length is not a multiple of 8, the input is treated as if it were
 * left-padded with zero bits. Any character other than "1" counts as a zero bit.
 *
 * @param string $bits
 * @return string
 */
function bitstostr(string $bits): string
{
    if ($bits === "") {
        return "";
    }
    // left-pad to a whole number of bytes, so no offset past the end of the string is ever read
    $missing = (8 - (strlen($bits) % 8)) % 8;
    if ($missing > 0) {
        $bits = str_repeat("0", $missing) . $bits;
    }
    $ret = "";
    for ($i = 0, $imax = strlen($bits); $i < $imax; $i += 8) {
        $chr = 0;
        for ($ii = 0; $ii < 8; ++ $ii) {
            if ($bits[$i + $ii] === '1') {
                // first character of the octet is the most significant bit
                $chr |= (1 << (7 - $ii));
            }
        }
        $ret .= chr($chr);
    }
    return $ret;
}
/**
 * Produce a colored `git diff` between an existing file and a new version of its content.
 *
 * @param string $file_path
 *            path of the existing file (the "old" side of the diff)
 * @param string $new_code
 *            the new content (the "new" side of the diff), sent to git through stdin
 * @throws \RuntimeException if git could not be started
 * @throws \LogicException if git exits with anything other than 0 (equal) or 1 (different)
 * @return string the diff as printed by git, an empty string if there is no difference
 */
function git_diff_pretty_file_code(string $file_path, string $new_code): string
{
    // using a file avoids a theoretical issue that is difficult to explain:
    // git reach end of file_path content but not end of stdin content,
    // and starts writing everything to stdout as diff, but the stdout buffer is full,
    // meanwhile php is waiting for proc_close() before running stream_get_contents($pipes[1]),
    // which could possibly result in a deadlock
    // (git wait for php to read stdout buffer, and php wait for git to exit)
    $stdout_handle = tmpfile();
    $stderr_handle = tmpfile();
    $descriptorspec = array(
        0 => array(
            "pipe",
            "rb"
        ),
        1 => $stdout_handle,
        2 => $stderr_handle
    );
    $cmd = implode(" ", array(
        "git diff",
        "--text",
        "--no-index",
        "--color=always",
        "--exit-code",
        escapeshellarg($file_path),
        "-"
    ));
    $pipes = [];
    $proc = proc_open($cmd, $descriptorspec, $pipes);
    if ($proc === false) {
        fclose($stdout_handle);
        fclose($stderr_handle);
        throw new \RuntimeException("proc_open() failed for command: {$cmd}");
    }
    try {
        fwrite_all($pipes[0], $new_code);
    } finally {
        // always close stdin, otherwise git would wait forever for more input
        fclose($pipes[0]);
    }
    $ret = proc_close($proc);
    rewind($stderr_handle);
    $stderr = stream_get_contents($stderr_handle);
    fclose($stderr_handle);
    rewind($stdout_handle);
    $stdout = stream_get_contents($stdout_handle);
    fclose($stdout_handle);
    // thanks to "--exit-code", 0 means equal, 1 means they differ, anything else means error
    // and without --exit-code, git sometimes unexpectedly returns 1 instead of 0,
    // and i can't find any documentation on what
    // "return 1 without --exit-code, and nothing to stderr" actually means..
    if ($ret !== 0 && $ret !== 1) {
        // carry the diagnostics in the exception instead of printing them to our own stdout
        $details = var_export([
            'git stderr' => $stderr,
            'git stdout' => $stdout,
            'git ret' => $ret
        ], true);
        throw new \LogicException("git diff failed with exit code " . var_export($ret, true) . ": " . $details);
    }
    return $stdout;
}
/**
 * create an absolute path,
 * supporting both windows-style (for Arild) and linux-style paths,
 * and ".." and "."
 *
 * The arguments are joined with the platform directory separator, repeated separators
 * are collapsed, and the result is resolved with realpath(). If the path cannot be resolved
 * and $mustExist is false, the joined (unresolved) path is returned instead.
 * If the last argument is exactly "/" or "\", the result ends with exactly one directory separator.
 *
 * @param bool $mustExist
 *            if true, throw when the resulting path does not exist
 * @param string ...$args
 *            the path fragments to join
 * @throws \InvalidArgumentException if $mustExist is true and the path does not exist..
 * @return string absolute path (or the joined input path if it could not be resolved)
 */
function pathify(bool $mustExist, string ...$args): string
{
    // end() yields false for an empty list, which matches neither separator
    $last_arg = end($args);
    $end_with_dir_separator = ($last_arg === '/' || $last_arg === '\\');
    $ret = str_replace([
        '\\',
        '/'
    ], DIRECTORY_SEPARATOR, implode(DIRECTORY_SEPARATOR, $args));
    // collapse runs of separators ("a//b" => "a/b") until nothing changes any more
    do {
        $cpy = $ret;
        $ret = strtr($ret, array(
            '\\\\' => '\\',
            '//' => '/'
        ));
    } while ($ret !== $cpy);
    $cpy = $ret;
    $ret = realpath($ret);
    if (false === $ret) {
        if ($mustExist) {
            throw new \InvalidArgumentException("path does not exist: {$cpy}");
        } else {
            $ret = $cpy;
        }
    }
    // realpath() strips the trailing separator, but the unresolved fallback still has it;
    // only append when it is actually missing, to avoid a doubled trailing separator
    if (strlen($ret) >= 2 && $end_with_dir_separator && substr($ret, - 1) !== DIRECTORY_SEPARATOR) {
        $ret .= DIRECTORY_SEPARATOR;
    }
    return $ret;
}
/**
 * convert seconds to a human-readable format
 * example: 123456789.1 seconds
 * brief=false: 3 years 10 months 28 days 21 hours 33 minutes 9.1 seconds
 * brief=true: 3y10M28d21h33m9.1s
 * default format is brief=false
 *
 * The input is rounded to millisecond precision. Units whose value is 0 are omitted,
 * except that the seconds unit is kept whenever there is a fractional part
 * (60.5 => "1 minute 0.5 seconds"). Years/months/days are calendar-based, counted
 * from the unix epoch. The sign of negative input is not represented in the output.
 *
 * @param float $seconds
 * @param bool $brief
 * @return string
 */
function secondsToHumanReadable(float $seconds, bool $brief = false): string
{
    // round first, so that a fraction like 0.9996 carries into the whole seconds
    // instead of being formatted as ".000" and silently dropped
    $seconds = round($seconds, 3);
    $whole = (int) $seconds;
    // fractional digits without trailing zeroes, '' if there is no fraction
    $frac = '';
    if (($seconds - $whole) !== 0.0) {
        $frac = rtrim(explode(".", number_format(abs($seconds - $whole), 3, '.', ''))[1], '0');
    }
    if ($whole === 0) {
        // special case: the date diff below would produce no units at all
        $value = ($frac === '' ? '0' : '0.' . $frac);
        return $value . ($brief ? 's' : ' seconds');
    }
    $diff = (new \DateTime('@0'))->diff(new \DateTime("@{$whole}"));
    // DateInterval property => [long name, brief suffix]
    $units = array(
        'y' => ['year', 'y'],
        'm' => ['month', 'M'],
        'd' => ['day', 'd'],
        'h' => ['hour', 'h'],
        'i' => ['minute', 'm'],
        's' => ['second', 's']
    );
    $parts = [];
    foreach ($units as $property => $names) {
        $amount = $diff->$property;
        $with_fraction = ($property === 's' && $frac !== '');
        if ($amount === 0 && ! $with_fraction) {
            continue;
        }
        $text = $amount . ($with_fraction ? '.' . $frac : '');
        if ($brief) {
            $parts[] = $text . $names[1];
        } else {
            // singular only for exactly 1, "1.5 seconds" is plural
            $plural = ($amount === 1 && ! $with_fraction) ? '' : 's';
            $parts[] = $text . ' ' . $names[0] . $plural;
        }
    }
    return implode($brief ? '' : ' ', $parts);
}
/**
 * Format a byte count with a binary (1024-based) unit: TB, GB, MB, KB or B.
 * The value is always printed with 2 decimals (number_format defaults, so with
 * thousands separators for large values).
 *
 * @param int $bytes
 * @return string
 */
function bytesToHumanReadable(int $bytes): string
{
    // largest unit first, the first one that fits wins
    $units = array(
        " TB" => 1024 * 1024 * 1024 * 1024,
        " GB" => 1024 * 1024 * 1024,
        " MB" => 1024 * 1024,
        " KB" => 1024
    );
    foreach ($units as $suffix => $unit_size) {
        if ($bytes >= $unit_size) {
            return number_format($bytes / $unit_size, 2) . $suffix;
        }
    }
    return number_format($bytes, 2) . " B";
}
/**
 * Check whether a TCP connection to $hostname:$port can be established.
 *
 * @param string $hostname
 * @param int $port
 * @param int|null $errno
 *            (output) error number reported by fsockopen()
 * @param string|null $errstr
 *            (output) error message reported by fsockopen()
 * @param float|null $connect_time
 *            (output) seconds spent in the connection attempt, successful or not
 * @param int $timeout_seconds
 *            connection timeout
 * @return bool true if the connection succeeded
 */
function is_port_open(string $hostname, int $port, ?int &$errno = null, ?string &$errstr = null, ?float &$connect_time = null, int $timeout_seconds = 2): bool
{
    $errno = null;
    $errstr = null;
    $started = microtime(true);
    // a refused/timed-out connection is an expected outcome here, hence the silenced warning
    $fp = @fsockopen($hostname, $port, $errno, $errstr, $timeout_seconds);
    $connect_time = microtime(true) - $started;
    if ($fp) {
        fclose($fp);
        return true;
    }
    return false;
}
/**
 * Pack an integer as little-endian bytes, using as few bytes as possible
 * (at least 1, so 0 becomes a single NUL byte).
 *
 * pack() itself only offers fixed widths:
 * 1 byte: chr() / pack(C)
 * 2 bytes: pack(v)
 * 4 bytes: pack(V)
 * 8 bytes: pack(P)
 * so the value is packed as 8 bytes and the unused high-order zero bytes are trimmed off.
 *
 * @param int $i
 * @return string
 */
function stupidpack_le(int $i): string
{
    if ($i === 0) {
        // rtrim() below would otherwise strip everything
        return "\x00";
    }
    return rtrim(pack('P', $i), "\x00");
}
/**
 * improvised/home-made/inferior replica of Laravel's dd/dump-and-die function for debugging,
 * does pretty much the same as var_dump($args,__FILE__,__LINE__);die();
 * screenshot: https://i.imgur.com/5k0LrDL.png
 *
 * Prints the call site (file:line), the source lines around it (2 before, 2 after),
 * and a var_dump() of all arguments, then terminates the script.
 * Output is plain text on the command line and when the response is text/plain;
 * otherwise the source excerpt is HTML-escaped.
 *
 * @param mixed ...$args
 * @return never
 */
function dd(...$args)
{
    // on the command line there are no HTTP headers and no browser, never emit HTML there
    $is_cli = (PHP_SAPI === 'cli' || PHP_SAPI === 'phpdbg');
    if (! $is_cli) {
        if (! headers_sent()) {
            header("Content-Type: text/plain; charset=utf-8");
        } else {
            // too late to change the content type, at least keep the layout readable
            echo "<pre>\n";
        }
    }
    $plainText = $is_cli || str_contains(var_export(headers_list(), true), 'text/plain;');
    $trace = debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS, 2);
    $file = $trace[0]['file'];
    $line = $trace[0]['line'];
    // file() keeps the line terminators, so the lines must be joined with "" (joining with
    // "\n" would double-space the excerpt)
    $code = rtrim(implode("", array_slice(file($file), max($line - 3, 0), 5)), "\r\n");
    echo "dd() called from $file:$line\n";
    echo "code:\n";
    if ($plainText) {
        echo $code, "\n";
    } else {
        echo htmlspecialchars($code, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE | ENT_DISALLOWED, 'UTF-8', true), "<br/>\n";
    }
    var_dump(...$args);
    die();
}
/**
 * Upload text to termbin.com (TCP port 9999) and return the URL of the paste.
 *
 * @param string $str
 *            the text to upload
 * @throws \Exception if the socket cannot be created, configured or connected
 * @throws \RuntimeException if sending the data or waiting for the response fails
 * @return string the paste URL as answered by the server (possibly empty if the server
 *         did not answer within 5 seconds), or 'paste is empty....' for empty input
 */
function pastebinit(string $str): string
{
    $len = strlen($str);
    if ($len < 1) {
        return 'paste is empty....'; // ....
    }
    $socket = socket_create(AF_INET, SOCK_STREAM, SOL_TCP);
    if ($socket === false) {
        throw new \Exception('socket_create() failed: ' . socket_strerror(socket_last_error()));
    }
    try {
        if (! socket_set_block($socket)) {
            throw new \Exception('socket_set_block() failed: ' . socket_strerror(socket_last_error($socket)));
        }
        $result = socket_connect($socket, 'termbin.com', 9999);
        if ($result === false) {
            throw new \Exception('socket_connect() failed: ' . socket_strerror(socket_last_error($socket)));
        }
        while ($len > 0) {
            $sent = socket_write($socket, $str, $len);
            if ($sent === false || $sent < 1) {
                throw new \RuntimeException("failed to write the last {$len} byte(s)!: " . socket_strerror(socket_last_error($socket)));
            }
            $str = substr($str, $sent);
            $len -= $sent;
        }
        assert($len === 0);
        socket_shutdown($socket, 1); // shutdown the socket for writing
        $full_url = '';
        for (;;) {
            $read = [$socket];
            $write = null;
            $except = [$socket];
            $seconds = 5;
            $ss = socket_select($read, $write, $except, $seconds);
            if ($ss === false) {
                throw new \RuntimeException("socket_select() failed: " . socket_strerror(socket_last_error($socket)));
            }
            if ($ss === 0) {
                // nothing arrived within the timeout, give up waiting
                break;
            }
            $url_chunk = "";
            $received = socket_recv($socket, $url_chunk, 99, MSG_DONTWAIT);
            // on failure socket_recv() returns false and sets the buffer to null,
            // on an orderly shutdown by the peer it returns 0
            if ($received === false || $received < 1 || ! is_string($url_chunk) || $url_chunk === '') {
                break;
            }
            $full_url .= $url_chunk;
            if (substr($url_chunk, - 1) === "\x00") {
                // the server terminates its answer with a NUL byte
                break;
            }
        }
    } finally {
        // also reached when one of the exceptions above is thrown, so the socket never leaks
        socket_close($socket);
    }
    return rtrim($full_url);
}
/**
 * Render an integer bitmask as PHP source code that ORs together its individual bits,
 * e.g. 5 => "(1 << 0) | (1 << 2)". A mask without any bit set is rendered as "0".
 *
 * @param int $flags
 * @return string
 */
function int_to_flags_code(int $flags): string
{
    $terms = [];
    $bit_width = PHP_INT_SIZE * 8;
    for ($bit = 0; $bit < $bit_width; ++$bit) {
        if ($flags & (1 << $bit)) {
            $terms[] = "(1 << $bit)";
        }
    }
    if (count($terms) === 0) {
        return '0';
    }
    return implode(' | ', $terms);
}
/**
 * Write the complete string to a stream, retrying after partial writes.
 *
 * @param resource $fp
 *            writable stream
 * @param string $data
 *            the bytes to write
 * @throws RuntimeException if fwrite() fails, or returns 0 more than 100 times in a row
 * @return void
 */
function fwrite_all($fp, string $data): void
{
    $total_len = strlen($data);
    $bytes_left = $total_len;
    $bytes_done = 0;
    $zero_streak = 0;
    do {
        $chunk_len = fwrite($fp, $data);
        if ($chunk_len === false) {
            throw new RuntimeException("fwrite failed after {$bytes_done}/{$total_len} bytes written: " . print_r(error_get_last(), true));
        }
        if ($chunk_len !== 0) {
            $zero_streak = 0;
        } elseif (++$zero_streak > 100) {
            // the stream keeps accepting nothing, stop instead of spinning forever
            throw new RuntimeException("fwrite failed after {$bytes_done}/{$total_len} bytes written: fwrite() returned 0 {$zero_streak} times in a row.");
        }
        $bytes_left -= $chunk_len;
        if ($bytes_left === 0) {
            return;
        }
        $bytes_done += $chunk_len;
        // drop the part that has been written, retry with the rest
        $data = substr($data, $chunk_len);
    } while (true);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
declare ( strict_types = 1 ) | |
; | |
error_reporting ( ~ 0 ); | |
/**
 * Error handler that converts PHP errors (warnings, notices, ...) into ErrorException.
 * Severities masked out of the current error_reporting() level are ignored.
 *
 * @param int $severity
 * @param string $message
 * @param string $file
 * @param int $line
 * @throws ErrorException for every error that is included in error_reporting()
 */
function exception_error_handler($severity, $message, $file, $line) {
    $is_reported = (error_reporting() & $severity);
    if ($is_reported) {
        throw new ErrorException($message, 0, $severity, $file, $line);
    }
    // This error code is not included in error_reporting: do nothing
}
set_error_handler ( "exception_error_handler" ); | |
class curl_async_stuff { | |
private $proxy_list_start_port = 9000; | |
private $proxy_list = array (); | |
private $mh; | |
private $workers = [ ]; | |
private $url_cache = [ ]; | |
private $url_cache_db_handle; | |
/**
 * Start 20 tor processes (SOCKS ports proxy_list_start_port .. +19, one data directory each),
 * wait 20 seconds for them to come up, then create the curl multi handle
 * and open (creating it if needed) the sqlite url cache next to this file.
 */
private function __construct() {
    $proxy_count = 20;
    $proxynum = 0;
    while ( $proxynum < $proxy_count ) {
        // every tor instance needs its own DataDirectory
        $datadir = "tor_data_dirs" . DIRECTORY_SEPARATOR . $proxynum;
        if (! is_dir ( $datadir )) {
            mkdir ( $datadir, 0777, true );
        }
        $datadir = realpath ( $datadir );
        $socks_port = $this->proxy_list_start_port + $proxynum;
        $cmd = "tor --SOCKSPort " . $socks_port . " --DataDirectory " . escapeshellarg ( $datadir );
        // stdin will be inherited if we don't create it, we dont want that,
        // so we create stdin just to close it. stdout/stderr are left inherited.
        $descriptorspec = array (
                0 => array (
                        "pipe",
                        "rb"
                )
        );
        $pipes = [ ];
        $process = proc_open ( $cmd, $descriptorspec, $pipes );
        fclose ( $pipes [0] );
        $this->proxy_list [$proxynum] = $process;
        ++ $proxynum;
    }
    // give the tor processes time to start: 20 x 1 second
    $seconds_waited = 0;
    while ( $seconds_waited < 20 ) {
        echo "waiting for tor setup...\n";
        sleep ( 1 );
        ++ $seconds_waited;
    }
    $this->mh = curl_multi_init ();
    $cache_file = __FILE__ . ".url_cache.sqlite3";
    $pdo_options = array (
            \PDO::ATTR_EMULATE_PREPARES => false,
            \PDO::ATTR_ERRMODE => \PDO::ERRMODE_EXCEPTION,
            \PDO::ATTR_DEFAULT_FETCH_MODE => \PDO::FETCH_ASSOC
    );
    $this->url_cache_db_handle = new \PDO ( 'sqlite:' . $cache_file, '', '', $pdo_options );
    $this->url_cache_db_handle->exec ( 'CREATE TABLE IF NOT EXISTS
`urlcache` (
id INTEGER PRIMARY KEY,
timestamp INTEGER,
url STRING UNIQUE,
html STRING
);' );
}
function __destruct() { | |
echo "destructing!"; | |
$this->block_until_handle_count_max ( 0 ); | |
curl_multi_close ( $this->mh ); | |
foreach ( $this->proxy_list as $proxy ) { | |
proc_terminate ( $proxy ); | |
proc_close ( $proxy ); | |
} | |
} | |
public static function Instance(): self { | |
static $instance = null; | |
if ($instance === null) { | |
$instance = new self (); | |
} | |
return $instance; | |
} | |
public function fetch_async_callback(string $url, array $additional_curlopts = [ ], callable $finished_callback = null, int $max_parallel = 100): void { | |
$this->block_until_handle_count_max ( $max_parallel ); | |
$db = $this->url_cache_db_handle; | |
$cache = $db->query ( "SELECT html FROM urlcache WHERE url = " . $db->quote ( $url ) )->fetch (); | |
if (isset ( $cache ["html"] )) { | |
($finished_callback) ( $cache ["html"], null ); | |
return; | |
} | |
echo "fetching {$url} (not from cache)\n"; | |
$worker_arr = array ( | |
"handle" => curl_init ( $url ), | |
"url" => $url, | |
"finished_callback" => $finished_callback | |
); | |
$opts = array ( | |
CURLOPT_RETURNTRANSFER => 1, | |
CURLOPT_ENCODING => "", | |
CURLOPT_USERAGENT => "php/" . PHP_VERSION . " libcurl/" . curl_version () ["version"], | |
CURLOPT_PRE_PROXY => 'SOCKS5://127.0.0.1:' . (random_int ( $this->proxy_list_start_port, $this->proxy_list_start_port + (count ( $this->proxy_list ) - 1) )) | |
); | |
foreach ( $additional_curlopts as $key => $overwrite ) { | |
$opts [$key] = $overwrite; | |
} | |
curl_setopt_array ( $worker_arr ['handle'], $opts ); | |
curl_multi_add_handle ( $this->mh, $worker_arr ['handle'] ); | |
$this->workers [( int ) $worker_arr ['handle']] = $worker_arr; | |
} | |
public function block_until_handle_count_max(int $max): void { | |
echo "handle count: " . count ( $this->workers ) . "\n"; | |
while ( count ( $this->workers ) > $max ) { | |
$this->block_until_at_least_1_download_completed (); | |
} | |
} | |
public function block_until_at_least_1_download_completed(): void { | |
if (count ( $this->workers ) < 1) { | |
// ... | |
return; | |
} | |
$closed_at_least_1 = false; | |
for(;;) { | |
curl_multi_exec ( $this->mh, $still_running ); | |
while ( $info = curl_multi_info_read ( $this->mh ) ) { | |
if ($info ['msg'] !== CURLMSG_DONE) { | |
continue; | |
} | |
$closed_at_least_1 = true; | |
$handle = $info ['handle']; | |
$content = curl_multi_getcontent ( $handle ); | |
if (! empty ( $content ) && strlen ( $content ) > 1000 && false === strpos ( $content, 'Du har nu överskridit den mängd sökningar' )) { | |
$stm = $this->url_cache_db_handle->prepare ( "INSERT INTO urlcache (timestamp, url, html) VALUES(:timestamp,:url,:html); " ); | |
$stm->execute ( array ( | |
":timestamp" => time (), | |
"url" => $this->workers [( int ) $handle] ['url'], | |
":html" => $content | |
) ); | |
} else { | |
$rate_limit_reason = ""; | |
if (empty ( $content )) { | |
$rate_limit_reason = "no response..."; | |
} elseif (strlen ( $content ) <= 1000) { | |
$rate_limit_reason = "response was too small (" . strlen ( $content ) . " bytes)"; | |
} elseif (false !== strpos ( $content, 'Du har nu överskridit den mängd sökningar' )) { | |
$rate_limit_reason = "'Du har nu överskridit den mängd sökningar' "; | |
} else { | |
$rate_limit_reason = "UNREACHABLE CODE REACHED FIXME WTF"; | |
} | |
throw new \LogicException ( "rate-limited on " . $this->workers [( int ) $handle] ['url'] . ": " . $rate_limit_reason ); | |
} | |
$cb = $this->workers [( int ) $handle]; | |
if (isset ( $cb ["finished_callback"] )) { | |
($cb ['finished_callback']) ( $content, $handle ); | |
} | |
unset ( $this->workers [( int ) $handle] ); | |
curl_multi_remove_handle ( $this->mh, $handle ); | |
curl_close ( $handle ); | |
} | |
if ($closed_at_least_1) { | |
return; | |
} else { | |
curl_multi_select ( $this->mh ); | |
} | |
} | |
} | |
} | |
/**
 * Downloads every company search result page from merinfo.se.
 *
 * For each character in a-z and 0-9 the first result page is fetched. The
 * total hit count is then parsed from that page's "Visar X till Y av Z
 * träffar." text, and all remaining pages (2 up to and including the last)
 * are fetched as well. The body collected last is dumped at the end.
 *
 * @throws \LengthException when the hit count cannot be found on a first page
 */
function get_all_companies_urls(): void {
    // Page size implied by the "Visar 1 till 20" text quoted below.
    $results_per_page = 20;
    $fetcher = curl_async_stuff::Instance();
    $responses = [];
    foreach (str_split("abcdefghijklmnopqrstuvwxyz0123456789") as $char) {
        $url = 'https://www.merinfo.se/search?who=' . urlencode($char) . '&d=c&page=1';
        $fetcher->fetch_async_callback($url, [], function ($html, $ch) use (&$responses, $url): void {
            echo "loaded {$url}!\n";
            $responses[$url] = $html;
        });
    }
    $fetcher->block_until_handle_count_max(0);
    // Snapshot the first-page URLs: the callbacks below keep adding entries
    // to $responses while we iterate.
    foreach (array_keys($responses) as $urlkey) {
        $response = $responses[$urlkey];
        // Visar 1
        // till 20
        // av 7386 träffar.
        $rex = '/Visar\s+(?<current_start_result>\d+)\s+till\s+(?<current_end_result>\d+)\s+av\s+(?<total_results>\d+)\s+träffar\./u';
        $matches = [];
        if (!preg_match($rex, $response, $matches)) {
            // Report the page being parsed, not whichever URL was queued last.
            throw new \LengthException("extract-match-count-rex failed on {$urlkey}");
        }
        var_dump($matches);
        // Round up so that a partially filled last page is fetched too.
        $total_pages = (int) ceil(((int) $matches["total_results"]) / $results_per_page);
        for ($pagenum = 2; $pagenum <= $total_pages; ++$pagenum) {
            // https://www.merinfo.se/search?who=a&d=c&page=2
            $url = rtrim($urlkey, '0123456789') . $pagenum;
            $fetcher->fetch_async_callback($url, [], function ($response, $ch) use ($url, &$responses): void {
                $responses[$url] = $response;
            });
            $fetcher->block_until_handle_count_max(50);
        }
    }
    $fetcher->block_until_handle_count_max(0);
    if ($responses !== []) {
        var_dump($responses[array_key_last($responses)]);
    }
}
// Script entry point: start the crawl as soon as this file is executed.
get_all_companies_urls();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment