Skip to content

Instantly share code, notes, and snippets.

@mvolfik
Created June 20, 2022 11:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mvolfik/a0875528358835cdf041fc126f155591 to your computer and use it in GitHub Desktop.
Save mvolfik/a0875528358835cdf041fc126f155591 to your computer and use it in GitHub Desktop.
For a given Shopify store, crawl the list of products from the API and from the sitemaps, then check whether both sources contain the same products.
import { gotScraping } from 'got-scraping';
// Shared HTTP client used for every request in this script.
// It is configured with a retry limit of 3 and with the proxy URL taken from
// the PROXY_URL environment variable (undefined when the variable is unset).
const fetcher = gotScraping.extend({
  retry: { limit: 3 },
  proxyUrl: process.env.PROXY_URL,
});
/**
 * Returns the first capture group of every match of `pattern` in `text`.
 *
 * @param {string} text - Text to scan.
 * @param {RegExp} pattern - Global regular expression with one capture group.
 * @returns {string[]} Captured substrings, in order of appearance.
 */
function collectCaptures(text, pattern) {
  return [...text.matchAll(pattern)].map((match) => match[1]);
}

/**
 * Compares the product handles listed in a Shopify store's product sitemaps
 * with the handles returned by its paginated `/products.json` endpoint.
 *
 * The store URL is read from the first command line argument. The handles
 * missing from either source, and the number present in both, are printed to
 * stdout. The process exits with code 1 when the argument is not a valid URL
 * or when robots.txt lacks the Shopify marker comment.
 *
 * @returns {Promise<void>}
 */
async function main() {
  let baseUrl;
  try {
    baseUrl = new URL(process.argv[2]);
  } catch (e) {
    console.error(
      'Provide a Shopify store URL as a command line argument (including `http(s)://`)',
    );
    process.exit(1);
  }
  baseUrl.pathname = '/';

  const robotsResponse = await fetcher.get(new URL('/robots.txt', baseUrl));
  if (
    !robotsResponse.body.includes('# we use Shopify as our ecommerce platform')
  ) {
    console.error("This doesn't seem to be a Shopify store");
    process.exit(1);
  }

  const rootSitemapResponse = await fetcher.get(
    new URL('/sitemap.xml', baseUrl),
  );
  // `[^<]*` instead of `.*` keeps each match inside its own <loc> element even
  // when the whole sitemap is on a single line. URLs inside XML have `&`
  // escaped as `&amp;`, so it is decoded before the URL is requested.
  const productSitemapUrls = collectCaptures(
    rootSitemapResponse.body,
    /<loc>(https?:\/\/[^/<]+\/sitemap_products_[^<]*)<\/loc>/g,
  ).map((url) => url.replaceAll('&amp;', '&'));

  // Fetch all product sitemaps in parallel; each task returns its own handles
  // rather than mutating shared state.
  const handlesPerSitemap = await Promise.all(
    productSitemapUrls.map(async (url) => {
      const sitemapResponse = await fetcher.get(url);
      console.log(`Fetched sitemap ${url}`);
      return collectCaptures(
        sitemapResponse.body,
        /<loc>https?:\/\/[^/<]+\/products\/([^<]*)<\/loc>/g,
      );
    }),
  );
  const productsFromSitemap = new Set(handlesPerSitemap.flat());

  // Crawl the API in parallel batches of pages. Each batch covers pages
  // [firstPage, nextFirstPage), growing by roughly 30% per round. Crawling
  // stops once a whole batch contributes no new handle.
  const productsFromAPI = new Set();
  let firstPage = 1;
  while (true) {
    const sizeBefore = productsFromAPI.size;
    const nextFirstPage = Math.ceil(firstPage * 1.3);
    const pageRequests = [];
    for (let page = firstPage; page < nextFirstPage; page++) {
      const pageUrl = new URL('/products.json', baseUrl);
      pageUrl.searchParams.set('page', String(page));
      pageRequests.push(
        (async () => {
          const response = await fetcher.get(pageUrl);
          console.log(
            `Fetched products page ${response.requestUrl.toString()}`,
          );
          const { products } = JSON.parse(response.body);
          return products.map((product) => product.handle);
        })(),
      );
    }
    for (const handles of await Promise.all(pageRequests)) {
      for (const handle of handles) {
        productsFromAPI.add(handle);
      }
    }
    console.log();
    firstPage = nextFirstPage;
    if (productsFromAPI.size === sizeBefore) {
      break;
    }
  }

  // Sort first so both "missing" lists are reported in sorted order.
  const sitemapHandles = [...productsFromSitemap].sort();
  const apiHandles = [...productsFromAPI].sort();
  const notInSitemap = apiHandles.filter(
    (handle) => !productsFromSitemap.has(handle),
  );
  const notInAPI = sitemapHandles.filter(
    (handle) => !productsFromAPI.has(handle),
  );
  const inBoth = apiHandles.length - notInSitemap.length;

  console.log('Missing from sitemap:', notInSitemap);
  console.log('Missing from API:', notInAPI);
  console.log('In both:', inBoth);
}
// Run the script. A rejection from main() is reported here instead of being
// left as a floating promise. Setting exitCode (rather than calling exit())
// signals failure without cutting off pending output.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment