#!/usr/bin/env node
// @ts-check
'use strict';
// Execute with the working directory at the root of CommunityScrapers.
// Dependency: https://npmjs.com/package/yaml
const fs = require('fs');
const path = require('path');
// Resolve the `yaml` package from `./validator/node_modules` first,
// falling back to the default module search paths.
const _moduleSearchPaths = (mod) => ([
  path.resolve(process.cwd(), './validator/node_modules'),
  ...(require.resolve.paths(mod) || []),
]);
const _yamlModule = require.resolve('yaml', { paths: _moduleSearchPaths('yaml') });
const YAML = require(_yamlModule);
// -------------
// Configuration
// -------------
/**
 * Separate Markdown table columns using ` | ` instead of `|`?
 */
const addSpacesBetweenColumns = false;
/**
 * Array of prefixes to remove from all hosts.
 * @type {string[]}
 */
const unwantedPrefixes = [
  'en',
  'free',
  'new',
  'tour',
  'www',
];
/**
 * A list of sites that have non-standard URLs in the scrapers list.
 * For example, given the source URL `www.mypornsite.xxx/scenes/`:
 * Keys are the generated hosts to override, and can be either of
 * (note that some prefixes are always stripped from the hostname first; see `unwantedPrefixes` above):
 * 1. Hostname = `mypornsite.xxx`
 * 2. Hostname + Pathname = `mypornsite.xxx/scenes/`
 * Values can be either a string or a list of strings to override the host with.
 * @type {{ [site: string]: string|string[] }}
 */
const siteNameOverride = {
  'api.metadataapi.net': 'metadataapi.net (JSON API)',
  'enasianmusume.kin8tengoku.com': 'kin8tengoku.com',
  'metadataapi.net': 'metadataapi.net (URL)',
  'mgstage.com': 'www.mgstage.com',
  'newsensations.com/tour_ns/dvds': 'newsensations.com/tour_ns/',
  'newsensations.com/tour_ns/updates': 'newsensations.com/tour_ns/',
  'newsensations.com/tour_rs/': 'newsensations.com/tour_rs/',
  'purgatoryx.com': 'tour.purgatoryx.com',
  'trans500.com/tour/': 'trans500.com/tour/',
  'trans500.com/tour3/': 'trans500.com/tour3/',
  'trans500.com/tourespanol': 'trans500.com/tourespanol',
  'wicked.com/en/movie/': 'wicked.com (/movies)',
};
/**
 * Known hosts for script-based scrapers whose configs contain no usable URL,
 * keyed by `fileName|objectType`. An empty list means the scraper is omitted
 * from the generated table.
 * @type {{ [lookupKey: string]: string[] }}
 */
const scriptSites = {
  'AdultimeAPI.yml|scene': ['adultime.com'],
  'IAFD.yml|performer': ['iafd.com'],
  'JacquieEtMichelTV.yml|scene': ['jacquieetmicheltv.net'],
  'JavLibrary_python.yml|scene': [],
  'MindGeekAPI.yml|scene': [],
  'multiscrape.yml|performer': [],
  'performer-image-dir.yml|performer': [],
  'stash-sqlite.yml|performer': [],
  'torrent.yml|scene': [],
  'xbvrdb.yml|scene': [],
};
// -------------
/**
 * @typedef ScraperHost
 * @property {string} host
 * @property {boolean} [scene]
 * @property {boolean} [performer]
 * @property {boolean} [movie]
 * @property {boolean} [gallery]
 * @property {boolean} usesPython
 * @property {boolean} usesNode
 */
/**
 * @typedef Scraper
 * @property {string} fileName
 * @property {ScraperHost[]} hosts
 * @property {boolean} usesCDP
 */
const columnSep = addSpacesBetweenColumns ? ' | ' : '|';
const escapeRegex = (/** @type {string} */string) => string.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');
const prefixes = unwantedPrefixes.map(escapeRegex).join('|');
const prefixesPattern = new RegExp(String.raw`^(${prefixes})\.`);
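// Example: with the default prefixes above, this is /^(en|free|new|tour|www)\./,
// so a hostname like 'www.example.com' is reduced to 'example.com'.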
/**
 * Make valid hosts for the scrapers list from a partial URL string.
 * @param {string} url
 * @returns {string[]}
 */
const makeHosts = (url) => {
  const urlObj = new URL(url.replace(/^(?!https?:)/, 'http://'));
  const hostname = urlObj.hostname.replace(prefixesPattern, '');
  const overrideKeys = [
    hostname,
    hostname + urlObj.pathname,
  ];
  let result = undefined;
  let idx = 0;
  while (idx < overrideKeys.length && result === undefined) {
    result = siteNameOverride[overrideKeys[idx]];
    idx++;
  }
  if (result === undefined) {
    result = [hostname];
  }
  return result instanceof Array ? result : [result];
};
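// Illustrative examples (`mypornsite.xxx` is the hypothetical site from the
// `siteNameOverride` comment above):
//   makeHosts('www.mypornsite.xxx/scenes/')          -> ['mypornsite.xxx']
//   makeHosts('https://api.metadataapi.net/scenes/') -> ['metadataapi.net (JSON API)']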
// Matches variable names inside `{...}` placeholders, e.g. `url` in `{url}`.
const queryURLVariablePattern = /(?<=\{)([a-z]+)(?=\})/g;
/**
 * @param {string} [queryURL]
 * @param {Object.<string, { regex: string, with: string }[]>} [queryURLReplace]
 */
const parseQueryURL = (queryURL, queryURLReplace) => {
  if (!queryURL) {
    return [];
  }
  if (!queryURLReplace) {
    return [queryURL];
  }
  const urls = [];
  (queryURL.match(queryURLVariablePattern) || []).forEach((key) => {
    const keyPattern = new RegExp(String.raw`\{${key}\}`, 'g');
    if (!queryURLReplace[key]) return;
    queryURLReplace[key].forEach(({ regex, with: repl }) => {
      // Skip replacements whose regex is only the end-of-string anchor.
      if (regex.trim() === '$') return;
      urls.push(queryURL.replace(keyPattern, repl));
    });
  });
  return urls;
};
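// Example (illustrative, with a hypothetical config):
//   queryURL: 'https://example.com/{path}/search'
//   queryURLReplace: { path: [{ regex: '.*', with: 'scenes' }, { regex: '.*', with: 'movies' }] }
// returns ['https://example.com/scenes/search', 'https://example.com/movies/search'].
// Note that only the `with` value is substituted; the regex itself is not applied here.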
/**
 * Zip two arrays of equal length.
 * @param {any[]} a
 * @param {any[]} b
 */
const zip = (a, b) => a.map((k, i) => [k, b[i]]);
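// Example: zip(['a', 'b'], [1, 2]) -> [['a', 1], ['b', 2]]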
const listPath = path.resolve(process.cwd(), './SCRAPERS-LIST.md');
/**
 * @returns {string[]}
 */
const getScrapers = () => {
  const scrapersDir = path.resolve(process.cwd(), './scrapers');
  return fs.readdirSync(scrapersDir).reduce(
    (acc, fname) => (fname.endsWith('.yml') ? acc.concat(path.join(scrapersDir, fname)) : acc),
    []
  );
};
let markdownHeader = '';
// Values here are only a fallback; the script reads the current columns from the existing list.
/** @type {{align: string, title: string}[]} */
let tableColumns = [
  {align: '' , title: 'Supported Site'},
  {align: '' , title: 'Scraper'},
  {align: '^', title: 'S'},
  {align: '^', title: 'G'},
  {align: '^', title: 'M'},
  {align: '^', title: 'P'},
  {align: '^', title: 'Needs'},
  {align: '^', title: 'Contents'},
];
const knownNeeds = [
  'cdp',
  // 'python3',
  // 'node',
];
/**
 * Parse the existing list to preserve the `Needs` and `Contents` columns,
 * which cannot be derived from the scraper config files.
 * @returns {[Object.<string, string[]>, Object.<string, string>]}
 */
const _getCurrentData = () => {
  /** @type {Object.<string, string[]>} */
  const needsMap = {};
  /** @type {Object.<string, string>} */
  const contentMap = {};
  const md = fs.readFileSync(listPath, 'utf8');
  const lines = md.split(/\r?\n/g);
  // The table header is the first line with exactly N-1 pipes for N columns.
  const tableHeader = lines.findIndex((line) => (line.match(/\|/g) || []).length === (tableColumns.length - 1));
  markdownHeader = lines.slice(0, tableHeader).join('\n');
  /**
   * Joined column titles and headers.
   * @type {[string, string][]}
   */
  // @ts-ignore
  const tableColumnHeaders = zip(...lines.slice(tableHeader, tableHeader + 2).map((h) => h.split('|')));
  /** @type {{align: string, title: string}[]} */
  const currentTableColumns = tableColumnHeaders.map(([title, header]) => {
    title = title.trim();
    header = header.trim();
    const left = header.slice(0, 1);
    const right = header.slice(-1);
    const align = (left === ':' && right === ':') ? '^' : (right === ':') ? '>' : (left === ':') ? '<' : '';
    return { align, title };
  });
  if (currentTableColumns.length > 0) {
    tableColumns = currentTableColumns;
  }
  lines.slice(tableHeader + 2).forEach((line) => {
    const [
      host,
      , // fileName
      , // scenes
      , // gallery
      , // movies
      , // performers
      needs,
      contents,
    ] = line.split('|');
    if (typeof contents === 'string' && contents.trim() !== '-') {
      contentMap[host.trim()] = contents.trim();
    }
    if (typeof needs === 'string' && needs.trim() !== '-') {
      needsMap[host.trim()] = needs
        .trim()
        .split(', ')
        .filter((n) => !knownNeeds.includes(n.toLowerCase()));
    }
  });
  return [needsMap, contentMap];
};
const [hostNeedsMap, hostContentMap] = _getCurrentData();
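// Example (illustrative row; cell values are hypothetical):
//   `example.com|Example.yml|:x:|:x:|:x:|:heavy_check_mark:|CDP, python3|Videos`
// yields hostNeedsMap['example.com'] = ['python3'] (CDP is in `knownNeeds`,
// since it is regenerated from the YAML below) and
// hostContentMap['example.com'] = 'Videos'.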
/**
 * Build the Markdown alignment row cell for a column.
 * @param {typeof tableColumns[number]} column
 * @returns {string}
 */
const makeColumnHeader = ({ align, title }) => {
  if (!align) {
    return '-'.repeat(Math.max(1, title.length));
  }
  const left = (align === '^' || align === '<') ? ':' : '';
  const right = (align === '^' || align === '>') ? ':' : '';
  const len = title.length - (left ? 1 : 0) - (right ? 1 : 0);
  return left + '-'.repeat(Math.max(1, len)) + right;
};
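// Examples:
//   makeColumnHeader({ align: '^', title: 'Needs' })   -> ':---:'
//   makeColumnHeader({ align: '',  title: 'Scraper' }) -> '-------'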
/**
 * @param {string[][]} accumulator
 * @param {typeof tableColumns[number]} column
 * @param {number} idx
 * @returns {string[][]}
 */
const tableHeaderReducer = ([titles, headers], column, idx) => {
  const columnHeader = makeColumnHeader(column);
  const len = column.title.length;
  const max = columnHeader.length;
  if (len >= max) {
    titles.push(column.title);
  } else {
    // Center the title within the width of the alignment cell.
    titles.push(
      column.title
        .padStart(len + Math.floor((max - len) / 2), ' ')
        .padEnd(max, ' ')
    );
  }
  return [titles, headers.concat(columnHeader)];
};
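// Example: { align: '^', title: 'S' } produces the cells ' S ' and ':-:',
// centering the one-character title over the three-character alignment cell.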
/**
 * Icons to use for supported/unsupported cells.
 * @param {boolean} val
 */
const getIcon = (val) => val ? ':heavy_check_mark:' : ':x:';
/**
 * Produce one Markdown table row per host of a scraper.
 * @param {Scraper} scraper
 */
const makeTableEntry = (scraper) => {
  const { fileName, hosts, usesCDP } = scraper;
  return hosts.map((hostObj) => {
    const { host } = hostObj;
    const scene = getIcon(!!hostObj['scene']);
    const gallery = getIcon(!!hostObj['gallery']);
    const movie = getIcon(!!hostObj['movie']);
    const performer = getIcon(!!hostObj['performer']);
    /** @type {string[]} */
    const needsArray = [].concat(hostNeedsMap[host] || []);
    // Anything handled here should be added to the `knownNeeds` array.
    (usesCDP) && needsArray.push('CDP');
    // (hostObj.usesPython) && needsArray.push('python3');
    // (hostObj.usesNode) && needsArray.push('node');
    const needs = (needsArray.length > 0) ? needsArray.join(', ') : '-';
    // Content is not available in the scraper config files,
    // so use the data from the current list.
    const content = hostContentMap[host] || '-';
    return [host, fileName, scene, gallery, movie, performer, needs, content].join(columnSep);
  });
};
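// Example (illustrative): a hypothetical scraper `Example.yml` supporting only
// scenes for `example.com`, with `useCDP` enabled and no existing needs/contents,
// yields the row:
//   `example.com|Example.yml|:heavy_check_mark:|:x:|:x:|:x:|CDP|-`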
let results = [];
// Matches top-level config keys such as `sceneByURL` or `performerByFragment`.
const mappingPattern = /^([a-z]+)By(Fragment|Name|URL)$/;
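// Example: 'sceneByURL' captures ('scene', 'URL'); 'galleryByFragment' captures ('gallery', 'Fragment').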
const yamlLoadOptions = {
  prettyErrors: true,
  version: '1.2',
};
for (const file of getScrapers()) {
  const relPath = path.relative(process.cwd(), file);
  let data;
  try {
    const contents = fs.readFileSync(file, 'utf8');
    data = YAML.parse(contents, yamlLoadOptions);
  } catch (error) {
    console.error(`\x1b[31mError parsing\x1b[0m ${relPath}:`);
    error.stack = null;
    console.error(error);
    break;
  }
  /** @type {Scraper} */
  const scraper = {
    fileName: path.basename(file),
    hosts: [],
    usesCDP: Boolean(data.driver && data.driver.useCDP),
  };
  Object.entries(data).forEach(([key, value]) => {
    const match = mappingPattern.exec(key);
    if (!match) {
      return;
    }
    /** @type {'scene'|'performer'|'movie'|'gallery'} */
    const objectType = (match[1]);
    /** @type {'URL'|'Fragment'|'Name'} */
    const scraperMethod = (match[2]);
    /**
     * @typedef ConfigSlim
     * @property {'scrapeXPath'|'scrapeJson'|'script'|'stash'} action
     * @property {string | string[]} [url]
     * @property {string} [queryURL]
     * @property {Object.<string, { regex: string, with: string }[]>} [queryURLReplace]
     * @property {string[]} [script]
     */
    /** @type {ConfigSlim[]} */
    (value instanceof Array ? value : [value]).forEach((config) => {
      const { action, url: urls, queryURL, queryURLReplace } = config;
      /** @type {Object.<string, boolean>} */
      const uses = {};
      if (action === 'script' && config.script) {
        uses.python = ['python', 'python3'].includes(config.script[0]);
        uses.node = config.script[0] === 'node';
      }
      const queryURLs = parseQueryURL(queryURL, queryURLReplace);
      if (urls || queryURLs.length) {
        for (const url of (urls || queryURLs)) {
          const hosts = makeHosts(url);
          for (const curHost of hosts) {
            // Skip hosts built from regex backreferences like `$1`.
            if (/^\$\d/.test(curHost))
              continue;
            let hostItem = scraper.hosts.find(h => h.host === curHost);
            if (!hostItem) {
              const newLength = scraper.hosts.push(
                /** @type {ScraperHost} */
                ({ host: curHost })
              );
              hostItem = scraper.hosts[newLength - 1];
            }
            hostItem[objectType] = true;
            hostItem.usesPython = Boolean(uses.python);
            hostItem.usesNode = Boolean(uses.node);
          }
        }
      } else if (action === 'script') {
        /** @type {string[]} */
        let hosts;
        const lookupKey = `${scraper.fileName}|${objectType}`;
        if (scriptSites[lookupKey]) {
          hosts = scriptSites[lookupKey];
        } else {
          // Fall back to a host already discovered for this object type.
          const hostObj = scraper.hosts.find((h) => h[objectType]);
          if (!hostObj) {
            console.log(`Skipped (unknown URL): ${scraper.fileName} ${key}`);
            return;
          }
          hosts = [hostObj.host];
        }
        for (const host of hosts) {
          let hostItem = scraper.hosts.find(h => h.host === host);
          if (!hostItem) {
            const newLength = scraper.hosts.push(
              /** @type {ScraperHost} */
              ({ host })
            );
            hostItem = scraper.hosts[newLength - 1];
          }
          hostItem[objectType] = true;
          hostItem.usesPython = Boolean(uses.python);
          hostItem.usesNode = Boolean(uses.node);
        }
      } else {
        console.log(`TODO: ${scraper.fileName} ${key}`);
      }
    });
  });
  results = results.concat(makeTableEntry(scraper));
}
const markdown = `${markdownHeader}
${
  tableColumns
    .reduce(tableHeaderReducer, [[], []])
    .map((arr) => arr.join(columnSep))
    .join('\n')
}
${results.sort((a, b) => a.toLowerCase().localeCompare(b.toLowerCase())).join('\n')}
`;
fs.writeFileSync(listPath, markdown, 'utf8');