Skip to content

Instantly share code, notes, and snippets.

@Stovoy
Created February 1, 2023 05:28
Show Gist options
  • Save Stovoy/8648a635d6a336dfa93ca47754c1946e to your computer and use it in GitHub Desktop.
Save Stovoy/8648a635d6a336dfa93ca47754c1946e to your computer and use it in GitHub Desktop.
import asyncio
import aiohttp
from bs4 import BeautifulSoup
# Scheme and host of the wiki; site-relative paths and scraped hrefs are
# appended to this to build the URLs that get fetched.
base_url = "https://control.fandom.com"
# Site-relative path of the index page whose links are crawled.
all_collectibles = "/wiki/Collectibles/List_of_Collectibles"
def stats(text):
    """Summarize the size of *text* as a human-readable string.

    Args:
        text: The string to measure.

    Returns:
        A string of the form ``"<N> characters, <M> words"``, where words
        are maximal runs of non-whitespace characters.
    """
    # split() with no argument splits on any run of whitespace, so newlines,
    # tabs and repeated spaces neither hide nor invent words, and an empty
    # string counts as zero words (split(" ") would report one).
    return f'{len(text)} characters, {len(text.split())} words'
async def download_file_text(session, url):
    """Fetch one page and return the text of its poem block.

    Args:
        session: The open ``aiohttp.ClientSession`` used for the request.
        url: Absolute URL of the page to download.

    Returns:
        The text of the first ``<p>`` inside the first ``<div class="poem">``
        on the page, or ``''`` when the page has no such element or the
        element is empty. Problems are reported on stdout, not raised.
    """
    async with session.get(url) as response:
        print(f"Getting page {url}...")
        html = await response.text()
    # Parsing needs only the downloaded HTML, so it happens after the
    # response has been released.
    parsed = BeautifulSoup(html, features="html.parser")
    poem = parsed.find("div", {"class": "poem"})
    if poem is None:
        print(f'[Error] No poem found for {url}')
        return ''
    # A poem block without a <p> child would otherwise raise AttributeError,
    # and a single exception aborts the caller's whole asyncio.gather batch.
    # Treat it as "no text" so the remaining pages are still processed.
    paragraph = poem.find('p')
    text = paragraph.get_text() if paragraph is not None else ''
    if not text:
        print(f'[Error] No text found for {url}')
    else:
        print(f'{url}: {stats(text)}')
    return text
async def download_all_file_text(batch_size=32):
    """Download every page linked from the collectibles index and join the text.

    The index page is fetched, every link inside its ``mw-parser-output``
    container is collected (minus the entries matched by the skip list), and
    the linked pages are downloaded concurrently in batches.

    Args:
        batch_size: Maximum number of pages requested concurrently.

    Returns:
        The text of all downloaded pages joined with ``", "``, in the order
        the links first appear on the index page. Returns ``""`` if the index
        page has no ``mw-parser-output`` container.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    async with aiohttp.ClientSession() as session:
        url = f"{base_url}{all_collectibles}"
        async with session.get(url) as response:
            print(f"Getting page {url}...")
            html = await response.text()

        parsed = BeautifulSoup(html, features="html.parser")
        content = parsed.find("div", {"class": "mw-parser-output"})
        if content is None:
            # Without the container there is nothing to crawl; report it in
            # the same style as the per-page errors instead of crashing with
            # an AttributeError on None.
            print(f'[Error] No collectible list found for {url}')
            return ""

        # Any href containing one of these substrings is not crawled.
        skip = ["#", "The_Foundation", "AWE_(expansion)", "/wiki/Control"]

        # href=True ignores anchors that have no href attribute (indexing
        # them would raise KeyError). dict.fromkeys removes duplicates while
        # keeping first-seen order, so a page linked several times is only
        # downloaded and counted once.
        hrefs = dict.fromkeys(
            link["href"]
            for link in content.find_all("a", href=True)
            if not any(s in link["href"] for s in skip)
        )
        urls = [f"{base_url}{href}" for href in hrefs]

        # Collect every page's text first and join once at the end, so the
        # separator is applied uniformly, including across batch boundaries.
        texts = []
        for start in range(0, len(urls), batch_size):
            batch = urls[start:start + batch_size]
            texts.extend(
                await asyncio.gather(
                    *(download_file_text(session, page_url) for page_url in batch)
                )
            )
        return ", ".join(texts)
async def main():
    """Download all collectible text and print its overall size summary."""
    combined = await download_all_file_text()
    summary = stats(combined)
    print(f'All collectibles: {summary}')
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())
# The answer? All collectibles: 346566 characters, 55008 words.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment