Skip to content

Instantly share code, notes, and snippets.

@TheMasteredPanda
Created October 22, 2022 10:34
Show Gist options
  • Save TheMasteredPanda/940ce56d76eb3abdce1baf5d0ddc317e to your computer and use it in GitHub Desktop.
Save TheMasteredPanda/940ce56d76eb3abdce1baf5d0ddc317e to your computer and use it in GitHub Desktop.
import asyncio
import json
import aiohttp
import bs4
from bs4.element import Tag
async def parse_cgs() -> dict:
    """Fetch the TLDR News Discord community guidelines and flatten them.

    The page is downloaded, the ``div.entry-content`` element is located, and
    the node at a fixed position among its children is treated as the
    (possibly nested) guideline list.

    Returns:
        A dict keyed by dotted position strings. Top-level items are keyed
        ``"1"``, ``"2"``, ...; nested items are keyed ``"1.1"``, ``"1.1.1"``
        and so on. Each value is the first child node of the corresponding
        list item (presumably the guideline text -- verify against the page).

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP
            error status.
        ValueError: If the page has no ``div.entry-content`` element.
        IndexError: If ``div.entry-content`` has too few children to contain
            the guideline list at the expected position.
    """
    url = "https://tldrnews.co.uk/discord-community-guidelines/"
    # 0-based position of the guideline list among the children of
    # div.entry-content. Tied to the current page layout.
    cg_list_index = 15

    def tag_children(node):
        """Yield only the element (Tag) children of *node*, skipping text."""
        return (child for child in node if isinstance(child, Tag))

    def walk(step_str: str, cg_id: int, branch: Tag) -> dict:
        """Flatten *branch* (one nested list item) into dotted-key entries.

        *step_str* is the dotted key of the enclosing item and *cg_id* is the
        1-based position of *branch* within it.
        """
        contents = branch.contents
        if not contents:
            # An empty item has nothing to record (and no contents[0]).
            return {}

        step_str = f"{step_str}.{cg_id}"
        if len(contents) == 1:
            # Leaf item: its single child node is the recorded value.
            return {step_str: contents[0]}

        # NOTE(review): an item that has a nested list contributes only its
        # descendants here; its own first child (contents[0]) is not
        # recorded, unlike top-level items. Confirm this is intended.
        flattened = {}
        for position, child in enumerate(tag_children(contents[1]), start=1):
            flattened.update(walk(step_str, position, child))
        return flattened

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # Fail loudly instead of parsing an error page.
            response.raise_for_status()
            html = await response.text()

    # Parsing happens after the connection has been released.
    soup = bs4.BeautifulSoup(html, "html.parser")
    entry = soup.find("div", {"class": "entry-content"})
    if entry is None:
        raise ValueError(f"no div.entry-content element found at {url}")

    entry_children = list(entry.children)
    if len(entry_children) <= cg_list_index:
        raise IndexError(
            f"expected the guideline list at child index {cg_list_index} of "
            f"div.entry-content, but it only has {len(entry_children)} children"
        )
    cg_list = entry_children[cg_list_index]

    parsed_cg = {}
    for cg_id, cg in enumerate(tag_children(cg_list), start=1):
        contents = cg.contents
        if not contents:
            # Skip empty items but keep numbering aligned with the page.
            continue

        str_cg_id = f"{cg_id}"
        parsed_cg[str_cg_id] = contents[0]

        if len(contents) > 1:
            # The second child is treated as the nested list of sub-items.
            for sub_id, cg_c in enumerate(tag_children(contents[1]), start=1):
                parsed_cg.update(walk(str_cg_id, sub_id, cg_c))

    return parsed_cg
async def main():
    """Fetch the parsed community guidelines and print them as JSON.

    The output is indented by four spaces.
    """
    guidelines = await parse_cgs()
    rendered = json.dumps(guidelines, indent=4)
    print(rendered)
# Run the scraper only when executed as a script, so that importing this
# module does not trigger a network request.
if __name__ == "__main__":
    asyncio.run(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment