stoph/section_titles.py

## section_titles.py
from progressbar import ProgressBar
import requests
from lxml import html
import os
import tqdm

# Crawl every URL listed in `filename` (one URL per line). For each page,
# print the tree path of every element matched by the "Overview" XPath
# (tab-separated), then the list of text found in <h2 class="body-h2">.
filename = "mmp.urls"

# Seconds to wait on a server before giving up. Without a timeout,
# requests.get() may block indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 30

# The bar's total is the file size in bytes, so it is advanced by the
# encoded byte length of each line that is read.
with tqdm.tqdm(total=os.path.getsize(filename)) as pbar:
  # newline="" keeps the original line endings, so "\r\n" still counts as
  # two bytes and the bar can reach its total.
  with open(filename, "r", newline="") as file:
    for line in file:
      pbar.update(len(line.encode(file.encoding)))

      url = line.strip()
      if not url:
        # A blank line (e.g. a trailing empty line) is not a URL;
        # requesting it would raise and abort the whole run.
        continue

      page = requests.get(url, timeout=REQUEST_TIMEOUT)
      root = html.fromstring(page.text)
      tree = root.getroottree()

      # Print the absolute tree path of each "Overview" match, separated by
      # tabs, on the same output line as the section list printed below.
      result = root.xpath('//html//body//main//div[6]//div[1]//div//*[contains(text(),"Overview")]')
      for r in result:
        print(tree.getpath(r), end="\t")

      # Prefer text inside <strong> within the h2; if there is none, fall
      # back to every text node under the h2.
      sections = root.xpath('//h2[@class="body-h2"]//strong/text()')
      if not sections:
        sections = root.xpath('//h2[@class="body-h2"]//text()')

      print(sections)
	from progressbar import ProgressBar
	import requests
	from lxml import html
	import os
	import tqdm

	# Crawl every URL listed in `filename` (one URL per line). For each page,
	# print the tree path of every element matched by the "Overview" XPath
	# (tab-separated), then the list of text found in <h2 class="body-h2">.
	filename = "mmp.urls"

	# Seconds to wait on a server before giving up. Without a timeout,
	# requests.get() may block indefinitely on an unresponsive host.
	REQUEST_TIMEOUT = 30

	# The bar's total is the file size in bytes, so it is advanced by the
	# encoded byte length of each line that is read.
	with tqdm.tqdm(total=os.path.getsize(filename)) as pbar:
	  # newline="" keeps the original line endings, so "\r\n" still counts as
	  # two bytes and the bar can reach its total.
	  with open(filename, "r", newline="") as file:
	    for line in file:
	      pbar.update(len(line.encode(file.encoding)))

	      url = line.strip()
	      if not url:
	        # A blank line (e.g. a trailing empty line) is not a URL;
	        # requesting it would raise and abort the whole run.
	        continue

	      page = requests.get(url, timeout=REQUEST_TIMEOUT)
	      root = html.fromstring(page.text)
	      tree = root.getroottree()

	      # Print the absolute tree path of each "Overview" match, separated by
	      # tabs, on the same output line as the section list printed below.
	      result = root.xpath('//html//body//main//div[6]//div[1]//div//*[contains(text(),"Overview")]')
	      for r in result:
	        print(tree.getpath(r), end="\t")

	      # Prefer text inside <strong> within the h2; if there is none, fall
	      # back to every text node under the h2.
	      sections = root.xpath('//h2[@class="body-h2"]//strong/text()')
	      if not sections:
	        sections = root.xpath('//h2[@class="body-h2"]//text()')

	      print(sections)