Last active
February 7, 2022 17:07
-
-
Save stoph/59c144d09f755ebb64477042be90f5af to your computer and use it in GitHub Desktop.
Extract all section titles from MMPs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys

import requests
import tqdm
from lxml import html
from progressbar import ProgressBar
# Print the section titles of every page listed in ``filename``.
#
# ``filename`` is read as one URL per line. For each URL the page is fetched
# and parsed, then a single line is written to stdout: the tree path of every
# element matched by the "Overview" XPath (each followed by a tab), then the
# list of section-title strings. Fetch failures are reported on stderr and the
# URL is skipped, so one bad URL does not abort the rest of the run.
filename = "mmp.urls"

# Seconds to wait on a request; requests never times out unless told to.
REQUEST_TIMEOUT = 30

# The bar total is the file size in bytes, so the file is opened in binary
# mode and the bar is advanced by the raw byte length of each line. Text mode
# would undercount on CRLF line endings or multi-byte characters.
with tqdm.tqdm(total=os.path.getsize(filename)) as pbar:
    with open(filename, "rb") as file:
        for raw_line in file:
            pbar.update(len(raw_line))
            url = raw_line.decode("utf-8").strip()
            if not url:
                # A blank line is not a URL; requests would raise on it.
                continue

            try:
                page = requests.get(url, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as err:
                # Keep diagnostics off stdout so the data output stays clean.
                print(f"{url}: {err}", file=sys.stderr)
                continue

            root = html.fromstring(page.text)
            tree = root.getroottree()

            # Show where the "Overview" text sits in this page's tree.
            result = root.xpath('//html//body//main//div[6]//div[1]//div//*[contains(text(),"Overview")]')
            for r in result:
                print(tree.getpath(r), end="\t")

            # Prefer the <strong> text inside each section heading; fall back
            # to all text under the heading when no <strong> text is found.
            sections = root.xpath('//h2[@class="body-h2"]//strong/text()')
            if not sections:
                sections = root.xpath('//h2[@class="body-h2"]//text()')
            print(sections)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment