Skip to content

Instantly share code, notes, and snippets.

@yshalsager
Created May 15, 2020 22:43
Show Gist options
  • Save yshalsager/03958b55db85cdc2b8c5ed1f7efb5d1e to your computer and use it in GitHub Desktop.
Save yshalsager/03958b55db85cdc2b8c5ed1f7efb5d1e to your computer and use it in GitHub Desktop.
princeofwales speeches scraper
from requests import get
from bs4 import BeautifulSoup
# Scrape the speeches listed on the Prince of Wales website and save the text
# of each one to "<slug>.txt" in the current working directory, where <slug>
# is the last path segment of the speech's link.
site = "https://www.princeofwales.gov.uk"
url = f"{site}/biographies/hrh-prince-wales/speeches?title=&mrfs=All&date_from=&date_to=&page="

# Number of listing pages to walk (page indices 0..76). This is hard-coded,
# not discovered from the site; adjust it if the listing grows or shrinks.
PAGE_COUNT = 77

# Seconds to wait on each HTTP request. Without a timeout, requests waits
# indefinitely, so a single stalled connection would hang the whole run.
REQUEST_TIMEOUT = 30

# CSS selector for the title link inside each row of a listing page.
SPEECH_LINK_SELECTOR = "div.views-row > div:nth-child(1) > h2:nth-child(1) > a:nth-child(1)"

for page in range(PAGE_COUNT):
    print(page)
    listing = BeautifulSoup(
        get(f"{url}{page}", timeout=REQUEST_TIMEOUT).content, "html.parser"
    )
    for speech in listing.select(SPEECH_LINK_SELECTOR):
        href = speech["href"]
        # The last path segment of the link doubles as the output file name.
        name = href.split("/")[-1]
        print(name)
        speech_html = get(f"{site}{href}", timeout=REQUEST_TIMEOUT).content
        content = BeautifulSoup(speech_html, "html.parser").select_one(".region-content")
        if content is None:
            # select_one returns None when nothing matches; skip this speech
            # instead of aborting the whole scrape with an AttributeError.
            print(f"skipping {name}: no .region-content element found")
            continue
        # Explicit UTF-8 so non-ASCII characters in the speech text are
        # written the same way regardless of the platform's default encoding.
        with open(f"{name}.txt", "w", encoding="utf-8") as out:
            out.write(content.get_text())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment