Last active
February 17, 2017 04:57
-
-
Save yoya/de609ca73d094fc609da6a8c853338f4 to your computer and use it in GitHub Desktop.
Metropolitan Museum of Art Public Images Downloader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/*
 * Metropolitan Museum of Art Public Images Downloader
 * (c) 2017/02/17(Fri) yoya@awm.jp
 * Usage) php getMetImages.php MetObjects.csv
 *
 * Reads the CSV file named on the command line. The first non-empty record
 * is taken as the column header. For every later record whose
 * "Is Public Domain" column is "True":
 *   1. the page named by the "Link Resource" column is fetched and saved as
 *      "<record index>.html" in the current directory;
 *   2. the first ".../original/... .jpg" URL found in that page (or, when
 *      there is none, the first ".png" one) is downloaded with wget.
 * Records flagged "False" are skipped. Any other flag value aborts the run.
 */

if (!isset($argv[1])) {
    fwrite(STDERR, "Usage) php getMetImages.php MetObjects.csv" . PHP_EOL);
    exit(1);
}

$file = new SplFileObject($argv[1]);
$file->setFlags(SplFileObject::READ_CSV | SplFileObject::READ_AHEAD | SplFileObject::SKIP_EMPTY | SplFileObject::DROP_NEW_LINE);

$keys = [];
// Index of the current non-empty record; 0 is the header row. It is bumped
// at the top of each iteration so the early `continue`s below cannot skip it.
$idx = -1;

foreach ($file as $record) {
    // Blank lines can surface as a record whose first cell is null.
    if (!is_array($record) || !isset($record[0])) {
        continue;
    }
    $idx++;

    foreach ($record as $i => $value) {
        if (substr($value, 0, 3) === "\xef\xbb\xbf") { // <U+FEFF> (UTF-8 BOM)
            $value = substr($value, 3);
        }
        $record[$i] = trim($value);
    }

    if ($idx === 0) {
        $keys = $record;
        continue;
    }

    echo "$idx" . PHP_EOL;

    // Re-key the row by column name. Cells beyond the header width have no
    // name and are dropped.
    $new_record = [];
    foreach ($record as $i => $value) {
        if (isset($keys[$i])) {
            $new_record[$keys[$i]] = $value;
        }
    }
    $record = $new_record;

    /*
     * main routine
     */
    $isPublicDomain = isset($record["Is Public Domain"]) ? $record["Is Public Domain"] : "";
    if ($isPublicDomain === "False") {
        continue;
    }
    if ($isPublicDomain !== "True") {
        // An unexpected flag value stops the whole run, as before; the
        // message goes to STDERR and the process exits with a non-zero code.
        $errMesg = "Unknown Is Public Domain flag:" . $isPublicDomain;
        fwrite(STDERR, $errMesg . PHP_EOL);
        exit(1);
    }

    $linkResource = isset($record["Link Resource"]) ? $record["Link Resource"] : "";
    if ($linkResource === "") {
        fwrite(STDERR, "Record $idx has no Link Resource; skipped" . PHP_EOL);
        continue;
    }
    echo $linkResource . PHP_EOL;

    $html = file_get_contents($linkResource);
    if ($html === false) {
        fwrite(STDERR, "Failed to fetch $linkResource; skipped" . PHP_EOL);
        continue;
    }
    if (file_put_contents("$idx.html", $html) === false) {
        fwrite(STDERR, "Failed to write $idx.html" . PHP_EOL);
    }

    // Look for a .jpg first and only then a .png, the same precedence as the
    // original pair of patterns. The character classes keep the match inside
    // a single URL token (no whitespace, quotes or angle brackets), so a line
    // holding several URLs can no longer be swallowed as one match.
    $originalUrl = null;
    foreach (["jpg", "png"] as $extension) {
        $pattern = '~https?://[^\s"\'<>]+/original/[^\s"\'<>]*\.' . $extension . '~';
        if (preg_match($pattern, $html, $matches)) {
            $originalUrl = $matches[0];
            break;
        }
    }
    if ($originalUrl === null) {
        continue;
    }
    echo $originalUrl . PHP_EOL;

    // The URL was scraped from remote HTML, so it must be quoted before it
    // reaches the shell.
    $output = [];
    $status = 0;
    exec("wget " . escapeshellarg($originalUrl), $output, $status);
    if ($status !== 0) {
        fwrite(STDERR, "wget exited with status $status for $originalUrl" . PHP_EOL);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment