Created
October 24, 2019 01:08
-
-
Save jonbarrow/f42ff7d083272e569e93347bbb5d3322 to your computer and use it in GitHub Desktop.
Scraper for animeultima.eu
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const cloudscraper = require('cloudscraper'); // Bypass cloudfare | |
const { JSDOM } = require('jsdom'); // DOM access in Node | |
const async = require('async'); // async utils | |
// Capture group 1: the site's numeric anime ID as embedded in the anime details page.
// `\d+` (not `\d*`) so an empty `anime-id=""` attribute is never reported as a match.
const animeIdRegex = /anime-id="(\d+)"/;
// Capture group 1: a stream URL from an in-line `file: "..."` entry of an embed page.
// `[^"]*` stops at the closing quote, so several entries on one line are captured separately.
const sourceRegex = /file: "([^"]*)"/g;
// Capture group 1: the `src` attribute of the first <iframe> tag.
// Works whether `src` is the first attribute or follows others, and stops at its closing quote.
const iframeRegex = /<iframe\b[^>]*?\ssrc="([^"]*)"/;

// URL list
const URL_BASE = 'https://animeultima.eu';
// Both endpoints are completed by the caller as `${URL}=${value}`
const SEARCH_URL = `${URL_BASE}/search?search`;
const EPISODE_LIST_URL = `${URL_BASE}/api/episodeList?animeId`;
// Main function | |
/**
 * Scrape animeultima.eu for the stream URLs of a single episode.
 *
 * Flow: search by title -> anime details page (to read the site's anime ID) ->
 * episode list API -> episode page (mirror list) -> every mirror's embed page.
 *
 * @param {object} kitsuDetails - Kitsu-style anime object. Only `attributes.titles`
 *   and `attributes.showType` are read.
 * @param {number} [episodeNumber=1] - Episode to look up; forced to 1 for movies.
 * @returns {Promise<Array<{stream: string, server: string, dub: boolean}>|null>}
 *   The streams found on the site-hosted mirrors, in mirror order, or `null` when no
 *   usable title, search result, anime ID, episode or episode URL is found.
 *   Rejects when the search, anime page, episode list or episode page request fails.
 *   A failing individual mirror is logged and skipped instead of failing the scrape.
 */
async function scrape(kitsuDetails, episodeNumber=1) {
	const { showType, titles } = kitsuDetails.attributes;

	// Movies don't have multiple episodes
	const wantedEpisode = showType === 'movie' ? 1 : Number(episodeNumber);

	// Search endpoint uses a specific version of the titles
	const { en_us, en, en_jp, jp, ja_jp } = titles || {};
	const title = en_jp || en || jp || en_us || ja_jp;
	if (!title) { // Nothing to search for
		return null;
	}

	// Get the search page HTML. The title is user data, so it must be encoded before it goes in the query string
	const searchHtml = await cloudscraper.get(`${SEARCH_URL}=${encodeURIComponent(title)}`);
	const searchDocument = new JSDOM(searchHtml).window.document;

	// Map the HTML elements to objects for easier use later (boxes without a link are ignored)
	const searchResults = [...searchDocument.querySelectorAll('.anime-box')]
		.map(box => box.querySelector('a'))
		.filter(Boolean)
		.map(link => ({
			title: link.getAttribute('title') || '',
			href: link.href
		}));

	const anime = searchResults.find(result => result.title.includes(title)); // Search for the requested anime
	if (!anime) { // Return nothing if not found
		return null;
	}

	/*
		NOTE:
		There is another potential way to scrape this site that does NOT require using the API or search endpoints.
		This site accepts URLs in the form of "https://animeultima.eu/a/SLUG/episode-NUMBER-[sub/dub]" which will
		redirect to the other URL format of "https://animeultima.eu/a/SLUG_SOME-ID/episode-NUMBER_SOME-ID-[sub/dub]".
		This means we do NOT need to know what those additional 2 IDs are in order to directly request an episode
		link, and it also cuts out the need to use the search endpoint to filter series and cuts out the API for
		episode listing.

		However, the "SLUG" is NOT reliable or standard. It seems to accept Kitsu slugs for some shows and not others.
		For example it accepts `boku-no-hero-academia-2nd-season` for My Hero Season 2, which is the slug provided by
		Kitsu. But it does NOT accept `spirited-away`, the slug provided by Kitsu, for Spirited Away. In the case of
		Spirited Away it expects `sen-to-chihiro-no-kamikakushi` which is the en_jp title converted to lowercase and
		with spaces replaced by dashes. But even this does not stay reliable, as seen with My Hero Season 4 where the
		slug from Kitsu is `boku-no-hero-academia-4` and the en_jp slug is the same, but yet it expects the slug to be
		`boku-no-hero-academia-4th-season`.

		So if you can figure out where this site is getting the slugs from, you can potentially skip all steps needed
		to find the anime and jump directly to episode pages.
	*/

	// The url `${anime.href}/episode-${episodeNumber}-sub` also redirects correctly but throws 404 errors when using CF scrapers
	// Maybe different request modules better support this? It would remove the need for the API and animeId requests below
	const animePage = await cloudscraper.get(anime.href); // Request the main anime details page
	const animeIdData = animeIdRegex.exec(animePage); // Look for the ID

	// If ID was not found, return nothing
	if (!animeIdData || !animeIdData[1]) {
		return null;
	}
	const animeId = animeIdData[1];

	// Get the episode list and search for the requested episode
	const episodeList = await cloudscraper.get(`${EPISODE_LIST_URL}=${animeId}`, { json: true });
	const episodes = (episodeList && Array.isArray(episodeList.episodes)) ? episodeList.episodes : [];
	const episode = episodes.find(({ episode_num }) => Number(episode_num) === wantedEpisode);

	// If episode was not found, return nothing
	if (!episode) {
		return null;
	}

	// Even if there's multiple URLs (for sub and dub), each URL actually has links to both sub and dub (if exist),
	// so either one works; prefer sub and fall back to dub
	const urls = episode.urls || {};
	const episodeUrl = urls.sub || urls.dub;
	if (!episodeUrl) {
		return null;
	}

	const episodePage = await cloudscraper.get(episodeUrl);
	const episodeDocument = new JSDOM(episodePage).window.document;

	// "au"engine and "au".ch = "a"nime"u"ltima, i.e. embeds served by the site itself
	const isAuServer = serverType => serverType === 'auengine' || serverType === 'au.ch';
	// "faststream" also seems to be stored on the AU website, but under a different embed location (noticed with newer series)
	const isFaststream = serverType => serverType.includes('faststream');

	// Get a list of embeds
	const embeds = [...episodeDocument.querySelectorAll('.server-selector .mirror-selector option')]
		.map(option => {
			const mirrorUrl = option.value;
			const serverType = option.innerHTML.toLowerCase().split(': ').pop(); // Server type determines where to make the following request(s)
			const [lang, episodeId] = mirrorUrl.split('-').pop().split('/'); // Trailing "LANG/ID" part of the mirror URL

			// Site-hosted servers get their dedicated embed URL; anything else is assumed to be an iframe to a different service
			let embedUrl = mirrorUrl;
			if (isAuServer(serverType)) {
				embedUrl = `${URL_BASE}/e/${episodeId}`;
			} else if (isFaststream(serverType)) {
				embedUrl = `${URL_BASE}/faststream/${episodeId}`;
			}

			return {
				dub: lang === 'dub', // check if the episode is dubbed
				server_type: serverType, // store the server type
				url: embedUrl
			};
		});

	// Request all the embeds in parallel. Each embed resolves to its own list of streams so that a
	// failing mirror only loses its own streams instead of leaving the whole scrape pending forever
	const streamsPerEmbed = await Promise.all(embeds.map(async embed => {
		try {
			const html = await cloudscraper.get(embed.url);

			// If the server type is AU or faststream then pull the source directly. The source is stored in a JSON object in-line
			if (isAuServer(embed.server_type) || isFaststream(embed.server_type)) {
				// There can be multiple streams per embed (e.g. one standard and one HLS), so collect all of them
				return [...html.matchAll(sourceRegex)].map(match => ({
					stream: match[1],
					server: embed.server_type,
					dub: embed.dub
				}));
			}

			// Assume it's an iframe if above check fails
			const iframeMatch = iframeRegex.exec(html);
			if (iframeMatch && iframeMatch[1]) {
				// TODO: Pass iframeMatch[1] to a specific embed scraper. These URLs are for services like Streamango and mp4upload
			}

			return [];
		} catch (error) {
			// Best effort: one broken mirror must not discard the streams of the others
			console.warn(`Skipping embed ${embed.url}:`, error);
			return [];
		}
	}));

	return streamsPerEmbed.flat();
}
// Demo entry point: scrapes episode 3 of "Clannad" using a hand-written stand-in for a
// Kitsu response, then prints the elapsed time followed by the streams that were found.
(async () => {
	console.time('Scrape Time');

	let streams;
	try {
		streams = await scrape({ // Fake Kitsu response
			attributes: {
				titles: {
					en_jp: 'Clannad',
				},
				showType: 'TV'
			}
		}, 3);
	} catch (error) {
		// Any rejected request inside scrape() ends up here (e.g. a captcha challenge reported by cloudscraper).
		// Report it and flag the failure instead of leaving an unhandled promise rejection
		console.error('Scrape failed:', error);
		process.exitCode = 1;
		return;
	} finally {
		// Always stop the timer, on success and on failure
		console.timeEnd('Scrape Time');
	}

	console.log(streams);
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Getting this error:
(node:11618) UnhandledPromiseRejectionWarning: CaptchaError: captcha at validateResponse (/Users/alexandresantos/animeflix-addon/node_modules/cloudscraper/index.js:273:11) at onCloudflareResponse (/Users/alexandresantos/animeflix-addon/node_modules/cloudscraper/index.js:222:5) at onRequestResponse (/Users/alexandresantos/animeflix-addon/node_modules/cloudscraper/index.js:205:5) at Request.<anonymous> (/Users/alexandresantos/animeflix-addon/node_modules/cloudscraper/index.js:149:7) at Object.onceWrapper (events.js:300:26) at Request.emit (events.js:210:5) at Request.<anonymous> (/Users/alexandresantos/animeflix-addon/node_modules/request/request.js:1161:10) at Request.emit (events.js:210:5) at Gunzip.<anonymous> (/Users/alexandresantos/animeflix-addon/node_modules/request/request.js:1083:12) at Object.onceWrapper (events.js:299:28) at Gunzip.emit (events.js:215:7) at endReadableNT (_stream_readable.js:1184:12) at processTicksAndRejections (internal/process/task_queues.js:80:21)