Last active
November 16, 2022 17:07
-
-
Save rdenadai/b634a09ee5f237251b219294202f672c to your computer and use it in GitHub Desktop.
Carrega o nome de pessoas (em português) de um site específico, cria uma listagem e salva em arquivo. Nomes masculinos e femininos.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio | |
import gc | |
from concurrent.futures import ProcessPoolExecutor | |
from enum import Enum | |
from functools import partial, wraps | |
from itertools import chain | |
from multiprocessing import cpu_count | |
from time import perf_counter | |
from bs4 import BeautifulSoup as bsoup | |
from httpx import AsyncClient, Response | |
class Sex(str, Enum):
    """Listing selector whose value is the URL path segment for that listing.

    Members are also ``str`` instances, so they compare equal to their value.
    """

    M = "nomes-masculinos"
    F = "nomes-femininos"

    def __format__(self, format_spec: str) -> str:
        """Format the member as its raw string value.

        Python 3.11 changed ``Enum.__format__`` so that mixed-in enums follow
        ``__str__`` and render as ``"Sex.M"``. Formatting the value explicitly
        keeps f-strings such as ``f"/{sex}/"`` producing the path segment on
        every Python version.
        """
        return format(self.value, format_spec)
def run_in_executor(_func):
    """Decorate a blocking callable so that calling it returns an awaitable.

    Each call binds the given arguments to ``_func`` and submits the bound
    call to the event loop's default executor; the resulting future is
    returned to the caller.
    """

    @wraps(_func)
    def submit(*args, **kwargs):
        # run_in_executor() accepts no keyword arguments for the target,
        # so the call is bound up front with partial().
        bound_call = partial(_func, *args, **kwargs)
        return asyncio.get_event_loop().run_in_executor(None, bound_call)

    return submit
async def fetch(url: str) -> Response:
    """Issue a GET request for *url*, following redirects, and return the response.

    A fresh ``AsyncClient`` is opened for the request and closed on exit.
    """
    async with AsyncClient() as http:
        reply = await http.get(url, follow_redirects=True)
        return reply
@run_in_executor
def save_names(filename: str, names: list[str]):
    """Write *names* to *filename* as UTF-8, one per line.

    The file is overwritten; no newline is written after the last name.
    """
    with open(filename, mode="w", encoding="utf-8") as out:
        content = "\n".join(names)
        out.write(content)
def parse_names(response: Response, name_size: int = 7) -> list[str]:
    """Extract the names of exactly *name_size* characters from a listing page.

    Args:
        response: HTTP response whose ``text`` is the HTML of a listing page.
        name_size: Exact length (after stripping whitespace) a name must have.

    Returns:
        The matching names in document order.
    """
    soup = bsoup(response.text, "lxml")
    spans = soup.find_all("span", class_="list-wide--name full-w")
    # ``Tag.string`` is None when a span is empty or has several children;
    # skip those instead of failing on ``None.strip()``.
    texts = (span.string for span in spans)
    return [
        name
        for text in texts
        if text is not None and len(name := text.strip()) == name_size
    ]
async def get_names(sex: str = Sex.M, name_size: int = 7, pages: int = 150) -> list[str]:
    """Download the listing pages for *sex* and return the matching names, sorted.

    Args:
        sex: URL path segment of the listing; a ``Sex`` member or a plain string.
        name_size: Exact length a name must have to be kept.
        pages: Number of listing pages to request, numbered from 1.

    Returns:
        Names collected from all pages in sorted order; duplicates are kept.
    """
    # Interpolate the raw value: on Python 3.11+ an f-string renders a
    # ``(str, Enum)`` member as "Sex.M" rather than its value, which would
    # produce a wrong URL.
    section = getattr(sex, "value", sex)
    urls = [
        f"https://www.dicionariodenomesproprios.com.br/{section}/{page}"
        for page in range(1, pages + 1)
    ]
    responses = await asyncio.gather(*(fetch(url) for url in urls))
    # HTML parsing is farmed out to worker processes.
    with ProcessPoolExecutor(max_workers=cpu_count() * 2) as pool:
        per_page = pool.map(partial(parse_names, name_size=name_size), responses, chunksize=5)
        # Consume the lazy map while the pool is still open.
        return sorted(chain.from_iterable(per_page))
async def main():
    """Fetch the male and female name lists concurrently, then save each to its file."""
    # Both listings are downloaded at the same time (15 pages each).
    nomes_m, nomes_f = await asyncio.gather(
        get_names(sex=Sex.M, pages=15),
        get_names(sex=Sex.F, pages=15),
    )
    # The two files are written concurrently as well.
    await asyncio.gather(
        save_names("nomes_masculinos.txt", nomes_m),
        save_names("nomes_femininos.txt", nomes_f),
    )
if __name__ == "__main__":
    try:
        start = perf_counter()
        print("Iniciando rotina...")
        # Set the collector's three generation thresholds, then move every
        # object allocated so far into the permanent generation so later
        # collections ignore it.
        gc.set_threshold(7_000, 100, 100)
        gc.freeze()
        asyncio.run(main())
        elapsed = perf_counter() - start
        print(f"Tempo total: {elapsed:.2f} segundos")
    except KeyboardInterrupt:
        # Ctrl+C ends the script quietly, without a traceback.
        pass
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment