Skip to content

Instantly share code, notes, and snippets.

@yosun
Last active December 12, 2021 10:33
Show Gist options
  • Save yosun/d1ef6ef56943bd2417b07f4970ff7447 to your computer and use it in GitHub Desktop.
Save yosun/d1ef6ef56943bd2417b07f4970ff7447 to your computer and use it in GitHub Desktop.
Tools for working with large JSON files (PHP-specific)
<?php
// Walk through a very large JSON file node by node instead of decoding it in one go.
ini_set('memory_limit', '512M');
set_time_limit(0);

// Show every error while this runs.
ini_set('display_errors', 1);
ini_set('display_startup_errors', 1);
error_reporting(E_ALL);

require_once('vendor/autoload.php');

// Couldn't get JsonMachine to work for the poly_dump json (250M, with verbose objects),
// so JsonReader is used instead. The earlier attempt, kept for reference:
//use \JsonMachine\JsonMachine;
//$string = file_get_contents("test.json");
//$data = JsonMachine::fromFile('metadata_unique_all.json');
use pcrov\JsonReader\JsonReader;

$testfile = __DIR__ . '/metadata_unique_all.json';

$reader = new JsonReader();
$reader->open($testfile);

// Advance two nodes before looping -- presumably onto the outer array and then onto
// its first element (TODO confirm against the layout of the input file).
$reader->read();
$reader->read();

// Stands in for: foreach ($data as $datum) { ... }
// Keeps going for as long as the current node is an object.
for (; $reader->type() === JsonReader::OBJECT; $reader->next()) {
    $datum = $reader->value();
    // do stuff!
}

$reader->close();
<?php
// check if links are valid ... for example before piping to s3
function url_exists($url) {
    // Returns 1 when getHttpResponseCode_using_getheaders() reports status 200 for $url, otherwise 0.
    // The comparison is deliberately loose (==): whatever the lookup returns is compared against the int 200.
    return (getHttpResponseCode_using_getheaders($url) == 200) ? 1 : 0;
}
function getHttpResponseCode_using_curl($url, $followredirects = true){
    // Ask for a URL's HTTP status code through cURL, requesting headers only (no body).
    // Returns the status code reported by curl_getinfo(), or false when $url is empty or
    // not a string, when the cURL handle cannot be created, or when cURL reports an error.
    // NOTE: no timeout options are set here, so the call can block for a while
    // (depending on connection, target site, and local timeout settings).
    // $followredirects falsy : redirects are not followed -> code of the FIRST response
    // $followredirects truthy: up to 10 redirects are followed -> code of the LAST response
    if(!is_string($url) || !$url){
        return false;
    }

    $handle = @curl_init($url);
    if(false === $handle){
        return false;
    }

    // Options are applied one at a time, in this order.
    $options = array(
        CURLOPT_HEADER         => true, // we want headers
        CURLOPT_NOBODY         => true, // dont need body
        CURLOPT_RETURNTRANSFER => true, // catch output (do NOT print!)
    );
    if($followredirects){
        $options[CURLOPT_FOLLOWLOCATION] = true;
        $options[CURLOPT_MAXREDIRS] = 10; // fairly random cap, guards against endless redirect chains
    }else{
        $options[CURLOPT_FOLLOWLOCATION] = false;
    }
    // Left disabled, as before: CURLOPT_CONNECTTIMEOUT (5), CURLOPT_TIMEOUT (6) and a
    // browser-like CURLOPT_USERAGENT.
    foreach($options as $option => $value){
        @curl_setopt($handle, $option, $value);
    }

    @curl_exec($handle);

    // A non-zero errno means the transfer failed; only read the status code otherwise.
    $failed = (bool) @curl_errno($handle);
    $status = $failed ? false : @curl_getinfo($handle, CURLINFO_HTTP_CODE);

    @curl_close($handle);
    return $status;
}
function getHttpResponseCode_using_getheaders($url, $followredirects = true){
    // Look up a URL's HTTP status code by scanning the header lines returned by get_headers().
    // Returns the status code as a 3-digit string (e.g. "200"), or false when $url is empty or
    // not a string, when no headers come back, or when no status line is found among them.
    // NOTE: the request blocks until get_headers() returns (more or less depending on
    // connection, target site, and local timeout settings).
    // $followredirects falsy : return the FIRST status code found in the header list
    // $followredirects truthy: return the LAST status code found in the header list
    if(! $url || ! is_string($url)){
        return false;
    }

    $headers = @get_headers($url);
    if(! $headers || ! is_array($headers)){
        // no headers :
        return false;
    }

    if($followredirects){
        // we want the last status code, so scan from the end of the list:
        $headers = array_reverse($headers);
    }

    foreach($headers as $hline){
        // Match status lines such as "HTTP/1.1 200 OK", "HTTP/1.0 200 OK",
        // "HTTP/1.1 301 Moved Permanently" or "HTTP/1.1 404 Not Found".
        // The reason phrase is optional: the 3-digit code may be followed by whitespace
        // OR by the end of the line, so a bare "HTTP/1.1 200" is recognised as well.
        if(preg_match('/^HTTP\/\S+\s+([1-9][0-9][0-9])(?:\s|$)/', $hline, $matches)){
            return $matches[1];
        }
    }

    // no HTTP/xxx status line found in headers:
    return false;
}
<?php
// Copy one remote file into an S3 bucket, but only when its URL passes url_exists().
// $filepath (the source URL) and $filenamewithextension (used as the S3 object key) must
// already be set before this code runs; $bucketName is optional and defaults below.
// url_exists() and the awsAccessKey / awsSecretKey constants are not defined here --
// presumably they come from another included file (verify against the caller).
// testing
/*
$filepath = 'https://poly.googleapis.com/downloads/fp/1602064148752848/4vds6twPsb7/c8Ksvo0_VjG/C:/Users/PC/Desktop/pom/pimSurface_Color.png';
$filenamewithextension = 'C:/Users/PC/Desktop/pom/pimSurface_Color.png';
*/
if(!isset($filepath))die('filepath?');
if(!isset($filenamewithextension))die('filenamewithextension not set');
// Removes every backslash-slash pair ("\/") from the key; plain slashes are left alone.
$filenamewithextension = str_replace('\/','',$filenamewithextension);
require_once( 'vendor/autoload.php' );
use Aws\S3\S3Client;
if(url_exists($filepath)){
    $credentials = new Aws\Credentials\Credentials(awsAccessKey, awsSecretKey);
    $s3 = new S3Client([
        'version' => 'latest',
        'region' => 'us-east-1',
        'credentials'=>$credentials]);
    if(!isset($bucketName))$bucketName = 'polydumpcurated';
    // Download before uploading: file_get_contents() returns false on failure, and that
    // value must not be handed to putObject() as the object body.
    $body = @file_get_contents($filepath);
    if($body === false){
        // Stay best-effort (no abort), but leave a trace and skip the upload.
        error_log('S3 upload skipped, download failed: '.$filepath);
    }else{
        @$s3->putObject([
            'Key'=>$filenamewithextension,
            'Bucket'=>$bucketName,
            'Body'=>$body
        ]);
    }
}
//unset($s3);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment