Skip to content

Instantly share code, notes, and snippets.

@stoph
Last active February 7, 2022 17:07
Show Gist options
  • Save stoph/59c144d09f755ebb64477042be90f5af to your computer and use it in GitHub Desktop.
Save stoph/59c144d09f755ebb64477042be90f5af to your computer and use it in GitHub Desktop.
Extract all section titles from MMPs
from progressbar import ProgressBar
import requests
from lxml import html
import os
import tqdm
# Walk the URL list in "mmp.urls" (one URL per line), fetch each page, and
# print, per page, the tree path of every matched element whose text contains
# "Overview" (tab-separated) followed by the list of section titles taken
# from the page's <h2 class="body-h2"> headings.
filename = "mmp.urls"

# The progress total is the file size in bytes, so the file is read in binary
# mode and the bar advances by the raw byte length of each line. Counting
# decoded characters instead would drift on CRLF or non-ASCII input.
with tqdm.tqdm(total=os.path.getsize(filename)) as pbar, \
        open(filename, "rb") as url_file:
    for raw_line in url_file:
        pbar.update(len(raw_line))
        url = raw_line.decode("utf-8").strip()
        if not url:
            # A blank line would make requests raise on the empty URL.
            continue

        # A timeout keeps one unresponsive host from stalling the whole run.
        page = requests.get(url, timeout=30)
        root = html.fromstring(page.text)
        tree = root.getroottree()

        # Report where each "Overview" element sits in the document tree.
        overview_nodes = root.xpath('//html//body//main//div[6]//div[1]//div//*[contains(text(),"Overview")]')
        for node in overview_nodes:
            print(tree.getpath(node), end="\t")

        # Prefer titles wrapped in <strong>; fall back to any text inside
        # the heading when no <strong> child is present.
        sections = root.xpath('//h2[@class="body-h2"]//strong/text()')
        if not sections:
            sections = root.xpath('//h2[@class="body-h2"]//text()')
        print(sections)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment