TheMasteredPanda/tldr_cgs_parser.py

## tldr_cgs_parser.py
import asyncio
import json

import aiohttp
import bs4
from bs4.element import Tag


def _collect_guidelines(list_node, prefix: str = "") -> dict:
    """Flatten a (possibly nested) guideline list into ``{number: node}``.

    Every ``Tag`` child of *list_node* is treated as one list item and is
    numbered from 1 in document order. Items of a nested list get dotted
    numbers built from their ancestors, e.g. ``"2.1.3"``.

    Args:
        list_node: Iterable of parse-tree nodes (normally the list ``Tag``).
            Children that are not ``Tag`` instances are ignored, so passing
            a plain text node simply yields an empty result.
        prefix: Dotted number of the enclosing item; ``""`` at the top level.

    Returns:
        A dict mapping each dotted number to the first content node of the
        corresponding item, in document order.
    """
    collected = {}
    items = (node for node in list_node if isinstance(node, Tag))
    for number, item in enumerate(items, start=1):
        key = f"{prefix}.{number}" if prefix else str(number)
        contents = item.contents
        if not contents:
            # An empty item still consumes its number but has nothing to store.
            continue
        # Always keep the item's own leading node, including for nested items
        # that carry a sub-list; previously only top-level items and leaves
        # were recorded, so intermediate rules silently lost their text.
        collected[key] = contents[0]
        if len(contents) > 1:
            # NOTE(review): assumes a nested list, when present, is the second
            # content node of the item -- confirm against the page markup.
            collected.update(_collect_guidelines(contents[1], key))
    return collected


async def parse_cgs(
    url: str = "https://tldrnews.co.uk/discord-community-guidelines/",
    list_index: int = 15,
) -> dict:
    """Download the community-guidelines page and return its rules by number.

    Args:
        url: Page to fetch. Defaults to the TLDR News guidelines page.
        list_index: Position, among the children of the ``entry-content``
            div, of the node holding the guideline list. The default of 15
            is tied to the page layout at the time of writing.

    Returns:
        A dict mapping dotted rule numbers (``"1"``, ``"1.2"``, ...) to the
        first content node of each rule.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP
            error status.
        ValueError: If the page has no ``div`` with class ``entry-content``.
        IndexError: If that div has fewer than ``list_index + 1`` children.
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # Fail loudly on 4xx/5xx instead of parsing an error page.
            response.raise_for_status()
            html = await response.text()

    # Parsing does not need the connection, so it happens after release.
    soup = bs4.BeautifulSoup(html, "html.parser")
    entry = soup.find("div", {"class": "entry-content"})
    if entry is None:
        raise ValueError(f"no 'entry-content' div found at {url}")

    cg_list = list(entry.children)[list_index]
    return _collect_guidelines(cg_list)


async def main():
    """Fetch the parsed guidelines and print them as 4-space-indented JSON."""
    guidelines = await parse_cgs()
    rendered = json.dumps(guidelines, indent=4)
    print(rendered)


if __name__ == "__main__":
    # Only start the event loop (and the network request) when executed as a
    # script, so importing this module has no side effects.
    asyncio.run(main())
	import asyncio
	import json

	import aiohttp
	import bs4
	from bs4.element import Tag


	def _collect_guidelines(list_node, prefix: str = "") -> dict:
	    """Flatten a (possibly nested) guideline list into ``{number: node}``.

	    Every ``Tag`` child of *list_node* is treated as one list item and is
	    numbered from 1 in document order. Items of a nested list get dotted
	    numbers built from their ancestors, e.g. ``"2.1.3"``.

	    Args:
	        list_node: Iterable of parse-tree nodes (normally the list ``Tag``).
	            Children that are not ``Tag`` instances are ignored, so passing
	            a plain text node simply yields an empty result.
	        prefix: Dotted number of the enclosing item; ``""`` at the top level.

	    Returns:
	        A dict mapping each dotted number to the first content node of the
	        corresponding item, in document order.
	    """
	    collected = {}
	    items = (node for node in list_node if isinstance(node, Tag))
	    for number, item in enumerate(items, start=1):
	        key = f"{prefix}.{number}" if prefix else str(number)
	        contents = item.contents
	        if not contents:
	            # An empty item still consumes its number but has nothing to store.
	            continue
	        # Always keep the item's own leading node, including for nested items
	        # that carry a sub-list, so intermediate rules do not lose their text.
	        collected[key] = contents[0]
	        if len(contents) > 1:
	            # NOTE(review): assumes a nested list, when present, is the second
	            # content node of the item -- confirm against the page markup.
	            collected.update(_collect_guidelines(contents[1], key))
	    return collected


	async def parse_cgs(
	    url: str = "https://tldrnews.co.uk/discord-community-guidelines/",
	    list_index: int = 15,
	) -> dict:
	    """Download the community-guidelines page and return its rules by number.

	    Args:
	        url: Page to fetch. Defaults to the TLDR News guidelines page.
	        list_index: Position, among the children of the ``entry-content``
	            div, of the node holding the guideline list. The default of 15
	            is tied to the page layout at the time of writing.

	    Returns:
	        A dict mapping dotted rule numbers (``"1"``, ``"1.2"``, ...) to the
	        first content node of each rule.

	    Raises:
	        aiohttp.ClientResponseError: If the server answers with an HTTP
	            error status.
	        ValueError: If the page has no ``div`` with class ``entry-content``.
	        IndexError: If that div has fewer than ``list_index + 1`` children.
	    """
	    async with aiohttp.ClientSession() as session:
	        async with session.get(url) as response:
	            # Fail loudly on 4xx/5xx instead of parsing an error page.
	            response.raise_for_status()
	            html = await response.text()

	    # Parsing does not need the connection, so it happens after release.
	    soup = bs4.BeautifulSoup(html, "html.parser")
	    entry = soup.find("div", {"class": "entry-content"})
	    if entry is None:
	        raise ValueError(f"no 'entry-content' div found at {url}")

	    cg_list = list(entry.children)[list_index]
	    return _collect_guidelines(cg_list)


	async def main():
	    """Fetch the parsed guidelines and print them as 4-space-indented JSON."""
	    result = await parse_cgs()
	    print(json.dumps(result, indent=4))


	if __name__ == "__main__":
	    # Only start the event loop (and the network request) when executed as
	    # a script, so importing this module has no side effects.
	    asyncio.run(main())