Skip to content

Instantly share code, notes, and snippets.

@beporter
Last active July 7, 2017 19:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save beporter/49410eee41c926b90d19313871851595 to your computer and use it in GitHub Desktop.
Save beporter/49410eee41c926b90d19313871851595 to your computer and use it in GitHub Desktop.
(Crudely) collect Amazon ASINs from URLs
<?php
/**
* Quick script to extract ASINs from Amazon URLs.
*
* @see http://stackoverflow.com/a/12827734/70876
*
* beporter at users dot sourceforge dot net
* 2016-05-18
*/
//--------------------------------------
/**
 * Returns every path segment of a URL that looks like an Amazon ASIN.
 *
 * A segment matches when it is either a "B" followed by two digits and seven
 * word characters, or a ten character ISBN-10 style code (nine digits followed
 * by a digit or "X").
 *
 * @param string $i A URL (absolute, relative, or scheme-less) to inspect.
 * @return array Matched ASINs in left-to-right order. Keys are the positions of
 *               the segments within the exploded path, so they are NOT
 *               necessarily sequential.
 */
function findAsins($i) {
    $path = parse_url((string) $i, PHP_URL_PATH);

    // parse_url() yields null when the URL has no path component and false
    // when the URL is seriously malformed; neither can contain an ASIN, and
    // feeding null to explode() is deprecated as of PHP 8.1.
    if (!is_string($path) || $path === '') {
        return [];
    }

    return array_filter(
        explode('/', $path),
        function ($segment) {
            return preg_match('/^(B\d{2}\w{7}|\d{9}(X|\d))$/', $segment) === 1;
        }
    );
}
//--------------------------------------
// main()
// Sample inputs covering absolute URLs, relative paths, scheme-less hosts,
// non-ASCII path segments, and query strings.
$inputs = [
    'http://www.amazon.com/gp/product/B00H00OG7G/ref=s9_simh_gw_g23_i2_r?ie=UTF8&fpl=fresh&pf_rd_m=ATVPDKIKX0DER&pf_rd_s=&pf_rd_r=1T4A8079XNP9A8ZH73RT&pf_rd_t=36701&pf_rd_p=5d23eaf6-6278-49c1-b6df-7de0cb9b3a26&pf_rd_i=desktop',
    '/Rust-Oleum-260357-Touch-Polyurethane-Matte/dp/B00714ZO22/more-shit/so-much/shit/',
    'Touch-Polyurethane-Matte/dp/B00714ZO22',
    'www.amazon.cn/电脑-it-办公/dp/B00YARCGHO/',
    '/B00YCCCCCC/ALSO10NUMS/?something-else',
    'Polyurethane/dp/B00RZYBZ9Y?foo=bar',
];
foreach ($inputs as $i) {
    // array_pop() takes its argument by reference, so the result must live in
    // a real variable first; passing the call expression directly raises
    // "Only variables should be passed by reference" on PHP 7+.
    $asins = findAsins($i);

    // array_pop() is us assuming the "last" of multiple matches in the URL is correct.
    echo array_pop($asins) . PHP_EOL;
}
/*--------------------------------------
Output:
$ php asins.php
B00H00OG7G
B00714ZO22
B00714ZO22
B00YARCGHO
B00YCCCCCC
B00RZYBZ9Y
*/
#!/usr/bin/env php
<?php
/**
* This script takes two arguments: A path to a CSV file and a column name
* from which to extract ASINs. It will return a quoted, comma separated
* string of all found ASINs, suitable for use in an SQL query's `IN (...)`
* clause.
*
* Example:
* $ ./asin-extract.php a_file.csv columnName
*/
/**
 * Returns an array of all path-separated ASINs matched in a provided URL.
 *
 * A path segment is treated as an ASIN when it is either a "B" followed by two
 * digits and seven word characters, or a ten character ISBN-10 style code
 * (nine digits followed by a digit or "X").
 *
 * @param string $s A URL possibly containing one or more `/ASIN/` path segments.
 * @return array All matched ASINs, in the order they were found in the URL
 *               (left to right). Keys are the segment positions within the
 *               exploded path, so they are NOT necessarily sequential.
 */
function findAsins($s) {
    $path = parse_url((string) $s, PHP_URL_PATH);

    // parse_url() yields null when the URL has no path component and false
    // when the URL is seriously malformed; neither can contain an ASIN, and
    // feeding null to explode() is deprecated as of PHP 8.1.
    if (!is_string($path) || $path === '') {
        return [];
    }

    return array_filter(
        explode('/', $path),
        function ($segment) {
            return preg_match('/^(B\d{2}\w{7}|\d{9}(X|\d))$/', $segment) === 1;
        }
    );
}
/**
 * main()
 *
 * Reads the CSV named by the first argument, locates the column named by the
 * second argument in the header row, and prints every ASIN found in that
 * column as a quoted, comma-separated list (8 per line) on STDOUT.
 * Errors are reported on STDERR with a non-zero exit status so they can never
 * be mistaken for (or pasted as) query output.
 */
if ($argc < 3) {
    fwrite(STDERR, 'Must supply CSV path as first argument and column name containing URLs with ASINs as second argument.' . PHP_EOL);
    exit(1);
}

$path = $argv[1];
$column = $argv[2];

// is_readable() is also true for directories, which SplFileObject cannot open.
if (is_dir($path) || !is_readable($path)) {
    fwrite(STDERR, 'Supplied CSV path is not readable.' . PHP_EOL);
    exit(1);
}

// Allows CSVs with bare-CR line endings to be split into rows.
// NOTE: this ini setting is deprecated as of PHP 8.1.
ini_set('auto_detect_line_endings', true);

$csv = new \SplFileObject($path);
$csv->setFlags(SplFileObject::READ_CSV);

// The first row is expected to hold the column names. An empty file does not
// necessarily produce an array here, so guard before searching it.
$headers = $csv->current();
$columnPosition = is_array($headers) ? array_search($column, $headers, true) : false;
if ($columnPosition === false) {
    fwrite(STDERR, sprintf('First row of CSV file does not contain a column named `%s`.', $column) . PHP_EOL);
    exit(1);
}

$foundAsins = [];
$isHeaderRow = true;
foreach ($csv as $row) {
    // foreach rewinds the file, so the header row is seen again; skip it.
    if ($isHeaderRow) {
        $isHeaderRow = false;
        continue;
    }

    // Blank lines and short rows have no value in the requested column.
    if (!is_array($row) || !isset($row[$columnPosition])) {
        continue;
    }

    // Append one by one instead of array_merge() to avoid re-copying the
    // accumulated list on every row.
    foreach (findAsins($row[$columnPosition]) as $asin) {
        $foundAsins[] = $asin;
    }
}

// Emit 8 ASINs per line; lines are joined with a comma so the whole output is
// a single valid list for an SQL `IN (...)` clause.
$lines = [];
foreach (array_chunk($foundAsins, 8) as $chunk) {
    $lines[] = '"' . implode('","', $chunk) . '"';
}
if ($lines !== []) {
    echo implode(',' . PHP_EOL, $lines) . PHP_EOL;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment