Skip to content

Instantly share code, notes, and snippets.

@18z
Created April 26, 2020 13:15
Show Gist options
  • Save 18z/31226dbb0a86e28aac4d0f1706dccba3 to your computer and use it in GitHub Desktop.
Save 18z/31226dbb0a86e28aac4d0f1706dccba3 to your computer and use it in GitHub Desktop.
import os
import re
import subprocess
from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup
# Download every 2020 PDF linked from the WHO COVID-19 situation-reports page
# into the local "pdf" directory, using the external `wget` program.
url = "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports"
urlhead = "https://www.who.int"


def report_filename(href):
    """Return the file name to save a report link under, or None to skip it.

    href is the value of an anchor's ``href`` attribute, or None when the
    anchor has none.  A link qualifies when it contains "2020" followed
    somewhere later by "pdf"; the name is the text from the first "2020"
    through the last "pdf", cut down to the part after its final "/".
    """
    if not href:
        return None
    matched = re.search("2020.*pdf", href)
    if matched is None:
        return None
    # The match can span several path segments; keeping only the last one
    # guarantees the name contains no "/" and so stays inside pdf/.
    return matched.group().rsplit("/", 1)[-1]


def wget_command(filename, full_url):
    """Return the wget argument list that saves full_url as pdf/<filename>.

    A list (rather than one shell string) is used so that characters such as
    "?", "&" or ";" in the scraped URL are passed to wget verbatim instead of
    being interpreted by a shell.
    """
    return ["wget", "-O", "pdf/{}".format(filename), full_url]


def main():
    """Scrape the report list and download each matching PDF with wget.

    Prints the file name of every report before downloading it and "nono"
    for every link that is skipped.  A failed download is reported and the
    loop moves on to the next link.

    Raises:
        SystemExit: if the page has no div with id "PageContent_C006_Col01".
    """
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(url) as response:
        soup = BeautifulSoup(response, features="html.parser")

    container = soup.find("div", {"id": "PageContent_C006_Col01"})
    if container is None:
        raise SystemExit(
            "div#PageContent_C006_Col01 not found at {}".format(url)
        )

    # wget -O does not create missing directories.
    os.makedirs("pdf", exist_ok=True)

    for anchor in container.find_all("a"):
        href = anchor.get("href")
        filename = report_filename(href)
        if filename is None:
            print("nono")
            continue

        print(filename)
        # urljoin copes with both site-relative and absolute hrefs.
        command = wget_command(filename, urljoin(urlhead, href))
        try:
            result = subprocess.run(command)
        except OSError as err:
            # Raised when wget itself cannot be started (e.g. not installed).
            print("could not run wget: {}".format(err))
            continue
        if result.returncode != 0:
            print("wget failed ({}) for {}".format(result.returncode, filename))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment