Created
February 1, 2023 05:28
-
-
Save Stovoy/8648a635d6a336dfa93ca47754c1946e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import aiohttp | |
from bs4 import BeautifulSoup | |
# Scheme and host of the wiki; page paths (including scraped hrefs) are
# appended to this to form absolute URLs.
base_url = "https://control.fandom.com"
# Path, relative to base_url, of the list page whose links are crawled.
all_collectibles = "/wiki/Collectibles/List_of_Collectibles"
def stats(text):
    """Return a short size summary of *text*.

    Words are counted by splitting on runs of any whitespace, so an empty
    string has 0 words, repeated spaces do not create phantom words, and
    words separated by newlines or tabs are counted individually.

    Args:
        text: The string to measure.

    Returns:
        A string of the form ``"<N> characters, <M> words"``.
    """
    # split() with no argument collapses whitespace runs and ignores
    # leading/trailing whitespace; split(" ") returned [''] for empty input.
    return f'{len(text)} characters, {len(text.split())} words'
async def download_file_text(session, url):
    """Fetch one page and return the text of its poem block.

    The page is parsed with BeautifulSoup and the text of the first ``<p>``
    inside the first ``<div class="poem">`` is extracted. Problems with the
    page content are reported on stdout instead of being raised.

    Args:
        session: The ``aiohttp.ClientSession`` used to issue the request.
        url: Absolute URL of the page to download.

    Returns:
        The extracted text, or ``''`` when the page has no poem block, the
        poem block has no ``<p>`` element, or the paragraph is empty.
    """
    async with session.get(url) as response:
        print(f"Getting page {url}...")
        html = await response.text()

    parsed = BeautifulSoup(html, features="html.parser")
    poem = parsed.find("div", {"class": "poem"})
    if poem is None:
        print(f'[Error] No poem found for {url}')
        return ''

    paragraph = poem.find('p')
    # find() returns None when the poem div has no <p>; calling get_text()
    # on that would raise AttributeError and abort the caller's whole batch.
    text = paragraph.get_text() if paragraph is not None else ''
    if not text:
        print(f'[Error] No text found for {url}')
    else:
        print(f'{url}: {stats(text)}')
    return text
async def download_all_file_text():
    """Download every page linked from the collectibles list and join the texts.

    The list page at ``base_url + all_collectibles`` is fetched, every anchor
    inside its ``<div class="mw-parser-output">`` is collected, and each
    remaining link is downloaded with ``download_file_text``. Requests are
    issued in batches of 32 concurrent downloads.

    Returns:
        The per-page texts, in link order, joined with ``", "``. Returns
        ``''`` when the list page has no ``mw-parser-output`` container.
    """
    batch_size = 32
    # Links whose href contains any of these fragments are not crawled.
    skip = ("#", "The_Foundation", "AWE_(expansion)", "/wiki/Control")

    async with aiohttp.ClientSession() as session:
        url = f"{base_url}{all_collectibles}"
        async with session.get(url) as response:
            print(f"Getting page {url}...")
            html = await response.text()

        parsed = BeautifulSoup(html, features="html.parser")
        content = parsed.find("div", {"class": "mw-parser-output"})
        if content is None:
            # Same report-and-return-empty style as download_file_text,
            # instead of an AttributeError on None.find_all.
            print(f'[Error] No content found for {url}')
            return ''

        page_urls = []
        for link in content.find_all("a", recursive=True):
            # Anchors without an href would raise KeyError on link["href"].
            href = link.get("href")
            if not href or any(s in href for s in skip):
                continue
            page_urls.append(f'{base_url}{href}')

        texts = []
        for start in range(0, len(page_urls), batch_size):
            batch = page_urls[start:start + batch_size]
            texts.extend(
                await asyncio.gather(
                    *(download_file_text(session, page_url) for page_url in batch)
                )
            )

    # Join once over all results: joining each batch separately and
    # concatenating dropped the separator at every batch boundary.
    return ", ".join(texts)
async def main():
    """Crawl all collectible pages and print a size summary of the combined text."""
    combined = await download_all_file_text()
    summary = stats(combined)
    print(f'All collectibles: {summary}')
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())
# The answer? All collectibles: 346566 characters, 55008 words.
# (Presumably the output the author recorded from one run - TODO confirm;
# the figures depend on the wiki's content at the time.)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment