Skip to content

Instantly share code, notes, and snippets.

@keddad
Created May 12, 2019 08:58
Show Gist options
  • Save keddad/d816b9c3c0f10ad9de8de35108e26a3d to your computer and use it in GitHub Desktop.
Save keddad/d816b9c3c0f10ad9de8de35108e26a3d to your computer and use it in GitHub Desktop.
def parsedoc(url):
    """Fetch the page at *url* and return simple statistics about it.

    The page is requested with a ``Mozilla`` User-Agent header and parsed
    with BeautifulSoup's ``html.parser``. Every ``<div id="comments">`` and
    ``<footer id="footer">`` is removed from the tree before anything is
    counted, so elements nested inside them do not contribute to the
    numbers below.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with the keys:

        * ``"title"`` - text of the first ``<h1>`` with all tags stripped,
          or ``""`` when the page has no ``<h1>``.
        * ``"time"`` - text of the first ``<time>`` with all tags stripped,
          or ``""`` when the page has no ``<time>``.
        * ``"headlines"`` - number of ``<h2>`` elements.
        * ``"img"`` - number of ``<img>`` elements.
        * ``"code"`` - number of ``<code>`` elements.

    Raises:
        Whatever ``urlopen`` raises when the request fails; nothing is
        caught here.
    """
    request = Request(url, headers={'User-Agent': 'Mozilla'})
    # The soup is built while the response is open; the ``with`` block then
    # closes the connection instead of leaving it to the garbage collector.
    with urlopen(request) as page:
        soup = BeautifulSoup(page, features="html.parser")

    # Drop the comment section and the footer before counting.
    for div in soup.find_all("div", {'id': "comments"}):
        div.decompose()
    for footer in soup.find_all("footer", {'id': "footer"}):
        footer.decompose()

    def _plain_text(tag):
        # ``find`` returns None for a missing element; str(None) would turn
        # into the literal text "None", so report an empty string instead.
        if tag is None:
            return ""
        return bleach.clean(str(tag), tags=[], strip=True)

    return {
        "title": _plain_text(soup.find('h1')),
        "time": _plain_text(soup.find('time')),
        "headlines": len(soup.find_all('h2')),
        "img": len(soup.find_all('img')),
        "code": len(soup.find_all('code')),
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment