Skip to content

Instantly share code, notes, and snippets.

@p182
Created November 4, 2021 23:55
Show Gist options
  • Save p182/1d722776620da229696804ea49f2ffdd to your computer and use it in GitHub Desktop.
Save p182/1d722776620da229696804ea49f2ffdd to your computer and use it in GitHub Desktop.
Copy snippet
<?php
/*Below is a code snippet for downloading a file from a web server to a local file.
It demonstrates useful customizations of the request (such as setting a User-Agent and Referrer, often required by web sites), and how to download a file only if the copy on the web site is newer than the local copy.
It further demonstrates the processing of response headers (if set by the server) to determine the timestamp and file name. The file type is checked because some servers return a 200 OK status code with a textual "not found" page, instead of a proper 404 status code.
*/
// Inputs read by this section (all expected to be set by the surrounding code):
//   $fURI:        URL to a file located on a web server
//   $target_file: Path to a local file
//   $source:      URL sent as the Referer request header
// Outputs: $rc (boolean success of the transfer) and, for HTTP URLs, the
// $http_response_header array that PHP fills in for the enclosing scope.

// Make the request conditional when a local copy exists. HTTP dates must be
// expressed in GMT in the fixed "Sun, 06 Nov 1994 08:49:37 GMT" layout;
// date("r") would emit local time with a numeric zone offset, which servers
// are free to reject or ignore.
if ( file_exists( $target_file ) ) {
    $ifmodhdr = 'If-Modified-Since: '.gmdate( 'D, d M Y H:i:s', filemtime( $target_file ) )." GMT\r\n";
}
else {
    $ifmodhdr = '';
}
// set request header for GET with referrer for modified files, that follows redirects
$arrRequestHeaders = array(
    'http'=>array(
        'method' =>'GET',
        'protocol_version' =>1.1,
        'follow_location' =>1,
        'header'=> "User-Agent: Anamera-Feed/1.0\r\n" .
            "Referer: $source\r\n" .
            $ifmodhdr
    )
);
// Download into a sibling temporary file instead of straight onto the target:
// a "304 Not Modified" answer has no body, and copy() would otherwise replace
// the existing local copy with a zero-length file.
$download_file = $target_file.'.part';
$rc = copy( $fURI, $download_file, stream_context_create($arrRequestHeaders) );
clearstatcache( true, $download_file );
if ( $rc && filesize( $download_file ) > 0 ) {
    // Something was actually transferred: promote it to the real target.
    $rc = rename( $download_file, $target_file );
}
// Remove whatever is left over (empty 304 body, partial transfer, failed rename).
clearstatcache( true, $download_file );
if ( is_file( $download_file ) ) {
    unlink( $download_file );
}
// HTTP request completed, preserve system error, if any.
// copy() reports its outcome as a boolean, not as a stream handle, so there is
// nothing to close here; passing that boolean to fclose() is itself an error.
// $err is always assigned (null on success, the last PHP error otherwise) so
// that later checks never read an undefined variable.
$err = $rc ? null : error_get_last();
// Parse HTTP Response Headers for HTTP Status, as well as filename, type, date information.
// Results:
//   $http_status              regex match of the final status line ([1] = code, [2] = reason
//                             phrase, possibly empty); empty array when no status line was seen
//   $http_content_modtime     Unix timestamp from Last-Modified (only set when parseable)
//   $http_content_image_type  image subtype from Content-Type (only set when listed in $arrValidTypes)
//   $http_content_filename    base name from Content-Disposition (only set when present)
// Throws Exception (code 2) when the final response carries a Content-Type that is not an
// accepted image type, unless the status line already reports a non-200 status, in which
// case the status is left for the subsequent status check to report.
// Need to start from rear, to get last set of headers after possible sets of redirection headers
$http_status = array();
$invalid_content_type = null;
// $http_response_header does not exist when no HTTP exchange took place, hence empty().
if ( !empty( $http_response_header ) ) {
    for ( $i = count($http_response_header) - 1; $i >= 0; $i-- ) {
        // The reason phrase is optional, so accept a bare "HTTP/1.1 200" as well;
        // otherwise parsing would run on into the headers of an earlier redirect.
        if ( preg_match('@^http/\S+\s+(\d{3})\s*(.*)$@i', $http_response_header[$i], $http_status) > 0 ) {
            // HTTP Status header means we have reached beginning of response headers for last request
            break;
        }
        elseif ( preg_match('@^(\S+):\s*(.+?)\s*$@', $http_response_header[$i], $arrHeader) > 0 ) {
            // HTTP field names are case-insensitive, so compare them lower-cased.
            switch ( strtolower( $arrHeader[1] ) ) {
                case 'last-modified':
                    if ( !isset($http_content_modtime) ) {
                        $modtime = strtotime( $arrHeader[2] );
                        // Ignore unparseable dates instead of storing false as a timestamp.
                        if ( $modtime !== false ) {
                            $http_content_modtime = $modtime;
                        }
                    }
                    break;
                case 'content-type':
                    if ( !isset($http_content_image_type) && $invalid_content_type === null ) {
                        if ( preg_match('@^image/(\w+)@ims', $arrHeader[2], $arrTokens) > 0
                            && in_array( strtolower($arrTokens[1]), $arrValidTypes, true ) ) {
                            $http_content_image_type = $arrTokens[1];
                        }
                        else {
                            // Remember the offending value; whether it is an error depends on
                            // the status line, which has not been reached yet at this point.
                            $invalid_content_type = $arrHeader[2];
                        }
                    }
                    break;
                case 'content-disposition':
                    if ( !isset($http_content_filename) && preg_match('@filename\\s*=\\s*(?|"([^"]+)"|([\\S]+));?@ims', $arrHeader[2], $arrTokens) > 0 ) {
                        // basename() strips any directory part supplied by the server.
                        $http_content_filename = basename($arrTokens[1]);
                    }
                    break;
            }
        }
    }
}
// A wrong content type only matters for a "200 OK" answer (e.g. a textual "not found"
// page) or when no status line was found at all. For any other status, such as a real
// 404 with an HTML body, the status itself is the meaningful error.
if ( $invalid_content_type !== null && ( !$http_status || $http_status[1] === '200' ) ) {
    throw new Exception( "Error accessing file $fURI; invalid content type: $invalid_content_type", 2);
}
// Turn the outcome of the request into either silent success (HTTP 200) or an exception.
// Reads: $http_status (status-line match, [1] = code, [2] = reason phrase), $err (last PHP
// error array, if any), $fURI and $http_response_header.
// Throws Exception with the HTTP status as code for non-200 answers, Exception with the
// PHP error type as code for transport errors, and customException (-1) when neither a
// status line nor an error is available.
// empty() is used because either variable may be unset when the request never got that far.
if ( !empty( $http_status ) ) {
    // Exception codes are integers; the regex capture is a string.
    $http_code = (int) $http_status[1];
    // Make sure we have good HTTP Status
    switch ( $http_code ) {
        case 200:
            // SUCCESS: HTTP Status is "200 OK"
            break;
        case 304:
            throw new Exception( "Remote file not newer: $fURI", $http_code );
        case 404:
            throw new Exception( "Remote file not found: $fURI", $http_code );
        default:
            throw new Exception( "HTTP Error, $http_status[2], accessing $fURI", $http_code );
    }
}
elseif ( !empty( $err ) ) {
    // Protocol / Communication error
    throw new Exception( $err['message'], $err['type'] );
}
else {
    // No HTTP status and no error
    // The header array may be missing entirely here, so do not index it blindly.
    $first_header = isset( $http_response_header[0] ) ? $http_response_header[0] : '';
    // NOTE(review): customException is not defined in this snippet; presumably it is
    // provided by the surrounding project - confirm.
    throw new customException( "Unknown HTTP response accessing $fURI: $first_header", -1 );
}
/*
Notes:
1. Currently copy() does NOT handle the 304 response code appropriately. Instead of NOT performing a copy (and possibly signaling this through its return code), it overwrites the target file with a zero-length file.
2. There may be a problem accessing a list of remote files when the HTTP 1.1 protocol is used. If you experience time-out errors, try the default 1.0 protocol version.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment