Skip to content

Instantly share code, notes, and snippets.

@peolic
Last active August 9, 2021 23:11
Show Gist options
  • Save peolic/eb9e8d5e565ccd8efad245ee3e1a4469 to your computer and use it in GitHub Desktop.
Save peolic/eb9e8d5e565ccd8efad245ee3e1a4469 to your computer and use it in GitHub Desktop.
#!/usr/bin/env node
// @ts-check
'use strict';
// Execute with working directory @ root of CommunityScrapers
// Dependency: https://npmjs.com/package/yaml
const fs = require('fs');
const path = require('path');
/**
 * Build the list of directories to search when resolving a module:
 * `validator/node_modules` under the current working directory comes first,
 * followed by whatever `require.resolve.paths` reports for that module.
 * @param {string} mod Name of the module to resolve.
 * @returns {string[]}
 */
const _moduleSearchPaths = (mod) => {
  const validatorModules = path.resolve(process.cwd(), './validator/node_modules');
  const defaultPaths = require.resolve.paths(mod) || [];
  return [validatorModules, ...defaultPaths];
};
const _yamlModule = require.resolve('yaml', { paths: _moduleSearchPaths('yaml') });
const YAML = require(_yamlModule);
// -------------
// Configuration
// -------------
/**
 * Whether Markdown table columns are separated with ` | ` (pipe padded with spaces)
 * instead of a bare `|`.
 */
const addSpacesBetweenColumns = false;
/**
 * Hostname prefixes that are removed from every host (for example a leading `www.`).
 * @type {string[]}
 */
const unwantedPrefixes = ['en', 'free', 'new', 'tour', 'www'];
/**
 * Overrides for sites whose URLs in the scraper configs are non-standard.
 *
 * Each key is a generated host, written in one of two forms
 * (shown here for the source URL `www.example.com/scenes/`):
 *   1. Hostname only:        `example.com`
 *   2. Hostname + pathname:  `example.com/scenes/`
 * The prefixes listed in `unwantedPrefixes` are stripped from the hostname
 * before the lookup, so keys are written without them. The hostname-only
 * form is looked up first.
 *
 * Each value is the host, or the list of hosts, to show in the list instead.
 * @type {{ [site: string]: string|string[] }}
 */
const siteNameOverride = {
  'api.metadataapi.net': 'metadataapi.net (JSON API)',
  'enasianmusume.kin8tengoku.com': 'kin8tengoku.com',
  'metadataapi.net': 'metadataapi.net (URL)',
  'mgstage.com': 'www.mgstage.com',
  'newsensations.com/tour_ns/dvds': 'newsensations.com/tour_ns/',
  'newsensations.com/tour_ns/updates': 'newsensations.com/tour_ns/',
  'newsensations.com/tour_rs/': 'newsensations.com/tour_rs/',
  'purgatoryx.com': 'tour.purgatoryx.com',
  'trans500.com/tour/': 'trans500.com/tour/',
  'trans500.com/tour3/': 'trans500.com/tour3/',
  'trans500.com/tourespanol': 'trans500.com/tourespanol',
  'wicked.com/en/movie/': 'wicked.com (/movies)',
};
/**
 * Hosts for script-based scrapers whose config declares neither `url` nor `queryURL`.
 *
 * Keys have the form `<scraper file name>|<object type>`, e.g. `IAFD.yml|performer`.
 * Each value lists the hosts to register for that scraper and object type;
 * an empty list registers no host for it.
 * @type {{ [lookupKey: string]: string[] }}
 */
const scriptSites = {
  'AdultimeAPI.yml|scene': ['adultime.com'],
  'IAFD.yml|performer': ['iafd.com'],
  'JacquieEtMichelTV.yml|scene': ['jacquieetmicheltv.net'],
  'JavLibrary_python.yml|scene': [],
  'MindGeekAPI.yml|scene': [],
  'multiscrape.yml|performer': [],
  'performer-image-dir.yml|performer': [],
  'stash-sqlite.yml|performer': [],
  'torrent.yml|scene': [],
  'xbvrdb.yml|scene': [],
};
// -------------
/**
* @typedef ScraperHost
* @property {string} host
* @property {boolean} [scene]
* @property {boolean} [performer]
* @property {boolean} [movie]
* @property {boolean} [gallery]
* @property {boolean} usesPython
* @property {boolean} usesNode
*/
/**
* @typedef Scraper
* @property {string} fileName
* @property {ScraperHost[]} hosts
* @property {boolean} usesCDP
*/
// Separator used when joining table cells into a row; chosen by `addSpacesBetweenColumns`.
const columnSep = addSpacesBetweenColumns ? ' | ' : '|';
/**
 * Backslash-escape every regular-expression metacharacter in a string,
 * so the result matches the input literally when used inside a pattern.
 * @param {string} string
 * @returns {string}
 */
const escapeRegex = (string) => {
  const metaCharacters = /[-\/\\^$*+?.()|[\]{}]/g;
  return string.replace(metaCharacters, '\\$&');
};
// Alternation of all unwanted prefixes, each escaped so it is matched literally.
const prefixes = unwantedPrefixes
  .map((prefix) => escapeRegex(prefix))
  .join('|');
// Matches one unwanted prefix, plus the dot after it, at the start of a hostname.
const prefixesPattern = new RegExp(`^(${prefixes})\\.`);
/**
 * Turn a (possibly partial) URL into the host name(s) to show in the scrapers list.
 *
 * The hostname has any unwanted prefix stripped, then `siteNameOverride` is
 * consulted: first by hostname alone, then by hostname + pathname.
 * @param {string} url Full or partial URL; `http://` is prepended when no http(s) scheme is present.
 * @returns {string[]} The override value(s) when one exists, otherwise the stripped hostname.
 */
const makeHosts = (url) => {
  // `URL` cannot parse scheme-less input such as `example.com/path`.
  const absoluteURL = url.replace(/^(?!https?:)/, 'http://');
  const { hostname: rawHostname, pathname } = new URL(absoluteURL);
  const hostname = rawHostname.replace(prefixesPattern, '');

  // The first candidate key that has an override wins.
  const override = [hostname, hostname + pathname]
    .map((candidate) => siteNameOverride[candidate])
    .find((value) => value !== undefined);

  if (override === undefined) {
    return [hostname];
  }
  return override instanceof Array ? override : [override];
};
/**
 * Matches the name of every `{placeholder}` in a query URL.
 * The lookbehind and lookahead require the surrounding braces but keep them
 * out of the match, so `match()` yields the bare names.
 */
const queryURLVariablePattern = /(?<=\{)([a-z]+)(?=\})/g;

/**
 * Expand a scraper's `queryURL` into candidate URLs by substituting its
 * `{placeholder}`s with the replacement strings from `queryURLReplace`.
 *
 * @param {string} [queryURL] URL template, e.g. `https://example.com/{url}`.
 * @param {Object.<string, { regex: string, with: string }[]>} [queryURLReplace]
 *   Replacement rules keyed by placeholder name.
 * @returns {string[]}
 *   - `[]` when `queryURL` is missing/empty;
 *   - `[queryURL]` unchanged when there are no replacement rules at all;
 *   - otherwise one URL per applicable rule (possibly none), with every
 *     occurrence of that rule's placeholder replaced by the rule's `with` value.
 */
const parseQueryURL = (queryURL, queryURLReplace) => {
  if (!queryURL) {
    return [];
  }
  if (!queryURLReplace) {
    return [queryURL];
  }

  const urls = [];
  // `match` returns null (not an empty array) when the URL has no placeholders.
  const keys = queryURL.match(queryURLVariablePattern) || [];
  for (const key of keys) {
    const replacements = queryURLReplace[key];
    if (!replacements) {
      continue;
    }
    const keyPattern = new RegExp(String.raw`\{${key}\}`, 'g');
    for (const { regex, with: repl } of replacements) {
      // Rules whose regex is just `$` are skipped
      // (presumably they only append a suffix rather than supply a URL).
      if (regex.trim() === '$') {
        continue;
      }
      urls.push(queryURL.replace(keyPattern, repl));
    }
  }
  return urls;
};
/**
 * Pair up the elements of two equal-length arrays by index.
 * @param {any[]} a
 * @param {any[]} b
 * @returns {any[][]} One `[a[i], b[i]]` pair per element of `a`.
 */
const zip = (a, b) => a.map((item, index) => {
  const partner = b[index];
  return [item, partner];
});
// Absolute path of `SCRAPERS-LIST.md`, resolved against the current working directory.
const listPath = path.resolve(process.cwd(), './SCRAPERS-LIST.md');
/**
 * List the scraper config files in `./scrapers` (relative to the current working directory).
 * @returns {string[]} Full path of every directory entry whose name ends in `.yml`.
 */
const getScrapers = () => {
  const scrapersDir = path.resolve(process.cwd(), './scrapers');
  return fs
    .readdirSync(scrapersDir)
    .filter((fname) => fname.endsWith('.yml'))
    .map((fname) => path.join(scrapersDir, fname));
};
// Text of the existing list that precedes the table; assigned when the list is read.
let markdownHeader = '';

/**
 * Columns of the generated table.
 * `align` is `'^'` (centered), `'<'` (left), `'>'` (right) or `''` (unspecified).
 * These values are only a fallback: they are replaced by the columns found in
 * the existing list when it is read.
 * @type {{align: string, title: string}[]}
 */
let tableColumns = [
  ['', 'Supported Site'],
  ['', 'Scraper'],
  ['^', 'S'],
  ['^', 'G'],
  ['^', 'M'],
  ['^', 'P'],
  ['^', 'Needs'],
  ['^', 'Contents'],
].map(([align, title]) => ({ align, title }));

/**
 * Lower-case names of the "Needs" values that this script works out by itself.
 * They are dropped from the values read from the existing list so that they
 * are not carried over.
 */
const knownNeeds = [
  'cdp',
  // 'python3',
  // 'node',
];
/**
 * Read the existing list and collect the per-host data that is carried over
 * into the regenerated list.
 *
 * Side effects: sets `markdownHeader` to the text that precedes the table and
 * replaces `tableColumns` with the columns (title + alignment) of the existing table.
 *
 * @returns {[Object.<string, string[]>, Object.<string, string>]}
 *   `[needsMap, contentMap]`, both keyed by the trimmed host cell:
 *   - `needsMap`: the row's "Needs" values, minus those listed in `knownNeeds`;
 *   - `contentMap`: the row's "Contents" cell.
 *   Cells that contain only `-` are not recorded.
 * @throws {Error} When the list contains no table title row followed by a delimiter row.
 */
const _getCurrentData = () => {
  /** @type {Object.<string, string[]>} */
  const needsMap = {};
  /** @type {Object.<string, string>} */
  const contentMap = {};

  const md = fs.readFileSync(listPath, 'utf8');
  const lines = md.split(/\r?\n/g);

  // The title row is the first line with exactly one `|` between each expected column.
  const expectedPipes = tableColumns.length - 1;
  const tableHeader = lines.findIndex((line) => (line.match(/\|/g) || []).length === expectedPipes);
  // Without a title row AND the delimiter row after it there is nothing to parse;
  // fail with a clear message before touching any module-level state.
  if (tableHeader === -1 || tableHeader + 1 >= lines.length) {
    throw new Error(
      `Could not find a ${tableColumns.length}-column table (title row + delimiter row) in ${listPath}`
    );
  }

  markdownHeader = lines.slice(0, tableHeader).join('\n');

  /**
   * Joined column titles and headers.
   * @type {[string, string][]}
   */
  // @ts-ignore
  const tableColumnHeaders = zip(...lines.slice(tableHeader, tableHeader + 2).map((h) => h.split('|')));

  /** @type {{align: string, title: string}[]} */
  const currentTableColumns = tableColumnHeaders.map(([title, header]) => {
    // A delimiter row shorter than the title row yields `undefined` cells: treat as unaligned.
    const delimiter = (header || '').trim();
    const left = delimiter.slice(0, 1);
    const right = delimiter.slice(-1);
    const align = (left === ':' && right === ':') ? '^' : (right === ':') ? '>' : (left === ':') ? '<' : '';
    return { align, title: title.trim() };
  });
  if (currentTableColumns.length > 0) {
    tableColumns = currentTableColumns;
  }

  // Every line after the delimiter row is treated as a table row.
  lines.slice(tableHeader + 2).forEach((line) => {
    const [
      host,
      , // fileName,
      , // scenes,
      , // gallery,
      , // movies,
      , // performers,
      needs,
      contents,
    ] = line.split('|');

    if (typeof contents === 'string' && contents.trim() !== '-') {
      contentMap[host.trim()] = contents.trim();
    }

    if (typeof needs === 'string' && needs.trim() !== '-') {
      needsMap[host.trim()] = needs
        .trim()
        .split(', ')
        .filter((n) => !knownNeeds.includes(n.toLowerCase()));
    }
  });

  return [needsMap, contentMap];
};

const [hostNeedsMap, hostContentMap] = _getCurrentData();
/**
 * Build the Markdown delimiter cell (`---`, `:--`, `--:` or `:-:` style) for a column.
 * The cell is as wide as the column title, with a minimum of one dash.
 * @param {typeof tableColumns[number]} column
 *   `align` of `'^'` puts a colon on both sides, `'<'` on the left, `'>'` on the right;
 *   any other value produces dashes only.
 * @returns {string}
 */
const makeColumnHeader = ({ align, title }) => {
  const left = ['^', '<'].includes(align) ? ':' : '';
  const right = ['^', '>'].includes(align) ? ':' : '';
  // The colons count toward the width, so they take the place of dashes.
  const dashCount = Math.max(1, title.length - left.length - right.length);
  return `${left}${'-'.repeat(dashCount)}${right}`;
};
/**
 * Reducer that builds the two header rows of the table, one column at a time.
 * The title is centered with spaces when the delimiter cell is wider than it.
 * @param {string[][]} accumulator `[titles, headers]` collected so far; `titles` is appended to in place.
 * @param {typeof tableColumns[number]} column
 * @param {number} idx Index of the column (unused).
 * @returns {string[][]} `[titles, headers]` including this column.
 */
const tableHeaderReducer = ([titles, headers], column, idx) => {
  const columnHeader = makeColumnHeader(column);
  const { title } = column;
  // Number of padding spaces needed for the title to be as wide as the delimiter cell.
  const slack = columnHeader.length - title.length;

  if (slack > 0) {
    // When the padding is odd, the extra space goes on the right.
    const leftPad = Math.floor(slack / 2);
    titles.push(' '.repeat(leftPad) + title + ' '.repeat(slack - leftPad));
  } else {
    titles.push(title);
  }

  return [titles, [...headers, columnHeader]];
};
/**
 * Pick the emoji shortcode shown in a "supported?" cell.
 * @param {boolean} val
 * @returns {string} `:heavy_check_mark:` for a truthy value, `:x:` otherwise.
 */
const getIcon = (val) => {
  if (val) {
    return ':heavy_check_mark:';
  }
  return ':x:';
};
/**
 * Render one table row per host of a scraper.
 * @param {Scraper} scraper
 * @returns {string[]} Rows in column order: host, file name, scene, gallery, movie, performer, needs, contents.
 */
const makeTableEntry = (scraper) => {
  const rows = [];

  for (const hostObj of scraper.hosts) {
    const { host } = hostObj;
    const icons = ['scene', 'gallery', 'movie', 'performer'].map((type) => getIcon(!!hostObj[type]));

    // Start from a copy of the needs carried over from the current list.
    /** @type {string[]} */
    const needsArray = [].concat(hostNeedsMap[host] || []);
    // Anything handled here should be added to the `knownNeeds` array.
    if (scraper.usesCDP) {
      needsArray.push('CDP');
    }
    // (hostObj.usesPython) && needsArray.push('python3');
    // (hostObj.usesNode) && needsArray.push('node');
    const needs = needsArray.length > 0 ? needsArray.join(', ') : '-';

    // The scraper config files do not describe the content,
    // so it is taken from the current list.
    const content = hostContentMap[host] || '-';

    rows.push([host, scraper.fileName, ...icons, needs, content].join(columnSep));
  }

  return rows;
};
// Table rows collected from all scrapers; reassigned as each scraper's rows are appended.
let results = [];
// Matches scraper mapping keys such as `sceneByURL`:
// group 1 is the object type (lower-case letters), group 2 is `Fragment`, `Name` or `URL`.
const mappingPattern = /^([a-z]+)By(Fragment|Name|URL)$/;
// Options passed to `YAML.parse` for every scraper file.
const yamlLoadOptions = {
prettyErrors: true,
version: '1.2',
};
/**
 * The subset of a scraper mapping config that this script reads.
 * @typedef ConfigSlim
 * @property {'scrapeXPath'|'scrapeJson'|'script'|'stash'} action
 * @property {string | string[]} [url]
 * @property {string} [queryURL]
 * @property {Object.<string, { regex: string, with: string }[]>} [queryURLReplace]
 * @property {string[]} [script]
 */

/**
 * Flag `host` as supporting `objectType`, creating its entry in `hosts` when needed.
 * The python/node flags accumulate: once set for a host, a later config that
 * does not use a script leaves them set.
 * @param {ScraperHost[]} hosts Host entries of the current scraper (appended to in place).
 * @param {string} host
 * @param {'scene'|'performer'|'movie'|'gallery'} objectType
 * @param {Object.<string, boolean>} uses Script interpreters used by the current config.
 */
const _registerHost = (hosts, host, objectType, uses) => {
  let hostItem = hosts.find((h) => h.host === host);
  if (!hostItem) {
    hostItem = /** @type {ScraperHost} */ ({ host });
    hosts.push(hostItem);
  }
  hostItem[objectType] = true;
  hostItem.usesPython = Boolean(hostItem.usesPython) || Boolean(uses.python);
  hostItem.usesNode = Boolean(hostItem.usesNode) || Boolean(uses.node);
};

// Parse every scraper config and collect its table rows into `results`.
for (const file of getScrapers()) {
  const relPath = path.relative(process.cwd(), file);

  let data;
  try {
    data = YAML.parse(fs.readFileSync(file, 'utf8'), yamlLoadOptions);
  } catch (error) {
    console.error(`\x1b[31mError parsing\x1b[0m ${relPath}:`);
    error.stack = null;
    console.error(error);
    // Abort the whole run: merely leaving the loop would let the script go on
    // to overwrite the list with only the scrapers processed so far.
    process.exit(1);
  }

  // A file that holds no mapping (e.g. an empty one) has nothing to list.
  if (data === null || typeof data !== 'object') {
    console.error(`Skipped (no mappings): ${relPath}`);
    continue;
  }

  /** @type {Scraper} */
  const scraper = {
    fileName: path.basename(file),
    hosts: [],
    usesCDP: Boolean(data.driver && data.driver.useCDP),
  };

  for (const [key, value] of Object.entries(data)) {
    // Only `<type>By<Method>` keys describe scraper mappings.
    const match = mappingPattern.exec(key);
    if (!match) {
      continue;
    }

    /** @type {'scene'|'performer'|'movie'|'gallery'} */
    const objectType = (match[1]);

    /** @type {ConfigSlim[]} */
    const configs = Array.isArray(value) ? value : [value];

    for (const config of configs) {
      const { action, url, queryURL, queryURLReplace } = config;

      /** @type {Object.<string, boolean>} */
      const uses = {};
      if (action === 'script' && config.script) {
        uses.python = ['python', 'python3'].includes(config.script[0]);
        uses.node = config.script[0] === 'node';
      }

      const queryURLs = parseQueryURL(queryURL, queryURLReplace);

      if (url || queryURLs.length > 0) {
        // `url` may be a single string: wrap it so that it is not iterated
        // character by character. Explicit URLs take precedence over query URLs.
        const sourceURLs = url ? (Array.isArray(url) ? url : [url]) : queryURLs;
        for (const sourceURL of sourceURLs) {
          for (const host of makeHosts(sourceURL)) {
            // Skip hosts that start with a back-reference such as `$1`
            // (presumably left over from a `queryURLReplace` substitution).
            if (/^\$\d/.test(host)) {
              continue;
            }
            _registerHost(scraper.hosts, host, objectType, uses);
          }
        }
      } else if (action === 'script') {
        // Script scrapers without URLs: use the configured hosts, or fall back
        // to a host of this scraper that already supports the same object type.
        const lookupKey = `${scraper.fileName}|${objectType}`;
        /** @type {string[]} */
        let hosts = scriptSites[lookupKey];
        if (!hosts) {
          const hostObj = scraper.hosts.find((h) => h[objectType]);
          if (!hostObj) {
            console.log(`Skipped (unknown URL): ${scraper.fileName} ${key}`);
            continue;
          }
          hosts = [hostObj.host];
        }
        for (const host of hosts) {
          _registerHost(scraper.hosts, host, objectType, uses);
        }
      } else {
        console.log(`TODO: ${scraper.fileName} ${key}`);
      }
    }
  }

  results = results.concat(makeTableEntry(scraper));
}
// Title row and delimiter row of the table.
const [titleRow, delimiterRow] = tableColumns
  .reduce(tableHeaderReducer, [[], []])
  .map((cells) => cells.join(columnSep));

// Sort the rows in place, ignoring case.
results.sort((a, b) => a.toLowerCase().localeCompare(b.toLowerCase()));

// Header text, table header, rows, and a trailing newline.
const markdown = [
  markdownHeader,
  titleRow,
  delimiterRow,
  results.join('\n'),
  '',
].join('\n');

fs.writeFileSync(listPath, markdown, 'utf8');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment