Skip to content

Instantly share code, notes, and snippets.

@rdenadai
Last active November 16, 2022 17:07
Show Gist options
  • Save rdenadai/b634a09ee5f237251b219294202f672c to your computer and use it in GitHub Desktop.
Save rdenadai/b634a09ee5f237251b219294202f672c to your computer and use it in GitHub Desktop.
Carrega o nome de pessoas (em português) de um site específico, cria uma listagem e salva em arquivo. Nomes masculinos e femininos.
import asyncio
import gc
from concurrent.futures import ProcessPoolExecutor
from enum import Enum
from functools import partial, wraps
from itertools import chain
from multiprocessing import cpu_count
from time import perf_counter
from bs4 import BeautifulSoup as bsoup
from httpx import AsyncClient, Response
class Sex(str, Enum):
    """URL path segment that selects the male or female name listing.

    Members are also ``str`` instances, so they compare equal to their
    plain-string value.
    """

    M = "nomes-masculinos"
    F = "nomes-femininos"

    def __str__(self) -> str:
        """Return the raw slug rather than ``Sex.M`` / ``Sex.F``.

        Since Python 3.11, ``format()`` and f-strings on a mixed-in enum
        delegate to ``__str__``, which by default yields ``"Sex.M"``.
        Returning the value keeps f-string interpolation of a member
        producing the slug on every Python version.
        """
        return self.value
def run_in_executor(_func):
    """Decorator that runs a blocking callable in the loop's default executor.

    Calling the decorated function does not run ``_func`` directly; it
    schedules ``_func(*args, **kwargs)`` on the event loop's default executor
    and returns the awaitable produced by ``loop.run_in_executor``.
    """

    @wraps(_func)
    def wrapped(*args, **kwargs):
        # run_in_executor only forwards positional arguments, so the call
        # (including keyword arguments) is frozen into a partial first.
        bound_call = partial(_func, *args, **kwargs)
        return asyncio.get_event_loop().run_in_executor(None, bound_call)

    return wrapped
async def fetch(url: str) -> Response:
    """Issue a GET request for ``url`` and return the response.

    A fresh ``AsyncClient`` is opened for this single request and closed
    before returning; redirects are followed.
    """
    async with AsyncClient() as http:
        reply = await http.get(url, follow_redirects=True)
    return reply
@run_in_executor
def save_names(filename: str, names: list[str]):
    """Write ``names`` to ``filename`` as UTF-8 text, one name per line.

    The file is overwritten. Names are separated by ``"\\n"`` and no newline
    is written after the last one. Because of the decorator, calling this
    function returns an awaitable instead of writing synchronously.
    """
    with open(filename, mode="w", encoding="utf-8") as out_file:
        payload = "\n".join(names)
        out_file.write(payload)
def parse_names(response: Response, name_size: int = 7) -> list[str]:
    """Extract the names of exactly ``name_size`` characters from a page.

    Args:
        response: HTTP response whose ``text`` is the HTML of a listing page.
        name_size: Required length of a name, measured after stripping
            surrounding whitespace.

    Returns:
        The matching names, in document order.
    """
    parser = bsoup(response.text, "lxml")
    links = parser.find_all("span", class_="list-wide--name full-w")
    # ``Tag.string`` is None when a span has no children or more than one
    # child, which made ``link.string.strip()`` raise AttributeError.
    # ``get_text(strip=True)`` yields the same text for a single-string span
    # and still returns a str (possibly empty) in the other cases.
    return [
        name
        for link in links
        if len(name := link.get_text(strip=True)) == name_size
    ]
async def get_names(sex: str = Sex.M, name_size: int = 7, pages: int = 150) -> list[str]:
    """Download the listing pages for ``sex`` and collect the matching names.

    Args:
        sex: Listing to scrape, either a ``Sex`` member or the raw URL
            path segment as a plain string.
        name_size: Only names of exactly this length are kept.
        pages: Number of listing pages to request, numbered from 1.

    Returns:
        All matching names from every page, sorted.
    """
    # Resolve the slug explicitly: on Python 3.11+ interpolating a
    # ``(str, Enum)`` member in an f-string renders ``Sex.M`` instead of its
    # value, which would produce an invalid URL.
    slug = sex.value if isinstance(sex, Enum) else sex
    urls = (
        f"https://www.dicionariodenomesproprios.com.br/{slug}/{i}"
        for i in range(1, pages + 1)
    )
    responses = await asyncio.gather(*[fetch(url) for url in urls])
    # NOTE: consuming ``exc.map`` here is synchronous, so this coroutine does
    # not yield to the event loop while the pages are being parsed.
    with ProcessPoolExecutor(max_workers=cpu_count() * 2) as exc:
        parsed_pages = exc.map(
            partial(parse_names, name_size=name_size), responses, chunksize=5
        )
        return sorted(chain.from_iterable(parsed_pages))
async def main():
    """Scrape the male and female name lists and save each to a text file.

    Both listings (15 pages each) are fetched concurrently, then both
    output files are written concurrently.
    """
    nomes_m, nomes_f = await asyncio.gather(
        get_names(sex=Sex.M, pages=15),
        get_names(sex=Sex.F, pages=15),
    )
    await asyncio.gather(
        save_names("nomes_masculinos.txt", nomes_m),
        save_names("nomes_femininos.txt", nomes_f),
    )
if __name__ == "__main__":
    # Script entry point: run the scraper once and report the elapsed time.
    # Ctrl+C ends the run quietly, without a traceback.
    try:
        started_at = perf_counter()
        print("Iniciando rotina...")
        # Adjust the collector thresholds, then move every object tracked so
        # far into the permanent generation so later collections skip them.
        gc.set_threshold(7_000, 100, 100)
        gc.freeze()
        asyncio.run(main())
        elapsed = perf_counter() - started_at
        print(f"Tempo total: {elapsed:.2f} segundos")
    except KeyboardInterrupt:
        pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment