Skip to content

Instantly share code, notes, and snippets.

@Granitosaurus
Created January 17, 2024 03:47
Show Gist options
  • Save Granitosaurus/05d39b0167641b6daf57028d6d199620 to your computer and use it in GitHub Desktop.
Scraper for ufcstats.com: concurrently fetches completed-event pages and collects per-fight details (fighters, weight class, method, referee).
import asyncio
import httpx
from bs4 import BeautifulSoup
async def scrape():
    """Scrape fight details from ufcstats.com's completed-events listing.

    Fetches the full completed-events page, follows every event link
    concurrently, then for each fight row follows the fight-details URL
    embedded in the row's ``onclick`` attribute and collects text fields.

    Returns:
        dict[str, list[str]] with keys ``fighters``, ``weight_class``,
        ``referee``, ``winner``, ``loser``, ``method``.
        NOTE(review): ``winner``/``loser`` are declared but never populated
        by the scraping logic below — kept for interface compatibility;
        ``referee`` collects every ``<span>`` on the fight page, so it
        likely over-captures — confirm against the page markup.
    """
    async with httpx.AsyncClient() as client:
        # First page lists every completed event (page=all).
        resp_first_page = await client.get(
            "http://ufcstats.com/statistics/events/completed?page=all"
        )
        soup_first_page = BeautifulSoup(resp_first_page.content, "html.parser")

        # Parallel lists accumulated across all fights.
        results = {
            "fighters": [],
            "weight_class": [],
            "referee": [],
            "winner": [],
            "loser": [],
            "method": [],
        }

        # Extract all event links, then fire the requests concurrently.
        event_links = soup_first_page.find_all(
            "a", {"class": "b-link b-link_style_black"}
        )
        tasks = [client.get(link.get("href")) for link in event_links]
        print(f'scraping {len(tasks)} fight pages')

        # as_completed runs all GETs together; the loop body executes as
        # each response arrives.
        for pending in asyncio.as_completed(tasks):
            resp_fight = await pending
            # FIX: was print("scraping: {}", url) — printed the literal
            # braces plus the URL as a second argument.
            print(f"scraping: {resp_fight.url}")
            event_soup = BeautifulSoup(resp_fight.content, "lxml")

            for row in event_soup.find_all("tr"):
                onclick = row.get("onclick")
                if not onclick:
                    continue  # header/filler rows carry no onclick
                # onclick presumably looks like doNav('http://…/fight-details/<id>')
                start = onclick.find("('")
                end = onclick.find("')")
                if start == -1 or end == -1:
                    # FIX: find() == -1 previously sliced a garbage URL
                    # (-1 + 2 == 1) which was then requested.
                    continue
                fight_url = onclick[start + 2:end]

                fight_resp = await client.get(fight_url)
                fight_soup = BeautifulSoup(fight_resp.content, "lxml")

                # First two black-style links on the fight page are the fighters.
                for fighter in fight_soup.find_all(
                    "a", {"class": "b-link b-link_style_black"}, limit=2
                ):
                    results["fighters"].append(fighter.get_text().strip())
                for weight in fight_soup.find_all(
                    "i", {"class": "b-fight-details__fight-title"}
                ):
                    results["weight_class"].append(weight.get_text().strip())
                for method in fight_soup.find_all(
                    "i", {"style": "font-style: normal"}
                ):
                    results["method"].append(method.get_text().strip())
                for ref in fight_soup.find_all("span"):
                    results["referee"].append(ref.get_text().strip())
    return results
print(asyncio.run(scrape()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment