Skip to content

Instantly share code, notes, and snippets.

@parkj90
Last active May 25, 2017 19:06
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save parkj90/b8d878abd4091efc63f9fe99cbb36ac0 to your computer and use it in GitHub Desktop.
Save parkj90/b8d878abd4091efc63f9fe99cbb36ac0 to your computer and use it in GitHub Desktop.
wikipedia challenge
import sys
import requests
from lxml import html
import time
# Command-line interface: a required start URL and an optional maximum number
# of page jumps (defaults to 25).
if len(sys.argv) < 2:
    # sys.exit() with a string writes it to stderr and exits with status 1,
    # so a usage error is reported as a failure. It also avoids the
    # interactive-only exit() helper, which is absent when Python runs
    # without the site module.
    sys.exit("Usage: {} URL [page jump limit]".format(sys.argv[0]))
link = sys.argv[1]
try:
    limit = int(sys.argv[2]) if len(sys.argv) > 2 else 25
except ValueError:
    # A non-numeric limit is a usage error, not a crash with a traceback.
    sys.exit("Usage: {} URL [page jump limit]".format(sys.argv[0]))
def get_page_tree(page):
    """Parse a fetched page into an element tree.

    page: response object whose ``content`` attribute holds the raw HTML.
    Returns the root element produced by ``html.fromstring``.
    """
    raw_html = page.content
    root = html.fromstring(raw_html)
    return root
def get_title(tree):
    """Return the article title taken from the page's main heading.

    tree: root element of the parsed article page.
    Returns the heading text with surrounding whitespace removed.
    Raises IndexError when the page has no element with id "firstHeading".
    """
    heading = tree.xpath('//*[@id="firstHeading"]')[0]
    # .text only holds the text that precedes the first child element, so it
    # is None when the title is wrapped in markup (e.g. <span> or <i>).
    # text_content() collects the text of the heading and all descendants.
    return heading.text_content().strip()
def _paren_depth(text, depth):
    """Return the parenthesis nesting depth after scanning text.

    text may be None (treated as empty). The depth never drops below zero,
    so a stray ")" cannot leave the scan stuck in a "parenthesized" state.
    """
    for ch in text or '':
        if ch == '(':
            depth += 1
        elif ch == ')' and depth:
            depth -= 1
    return depth


def get_first_link(tree):
    """Return the URL of the first non-parenthesized link in the article body.

    tree: root element of the parsed article page.
    Returns an absolute https://en.wikipedia.org URL built from the first
    <a> that is a direct child of a body paragraph and lies outside
    parentheses, or None when no such link exists.
    """
    # Paragraphs are either direct children of #mw-content-text or sit one
    # level down inside the "mw-parser-output" wrapper div; accept both.
    paragraphs = tree.xpath(
        '//*[@id="mw-content-text"]/p'
        ' | //*[@id="mw-content-text"]'
        '/div[contains(@class, "mw-parser-output")]/p'
    )
    for p in paragraphs:
        # Start each paragraph at depth zero and include the text that
        # precedes its first child element, which may open a parenthesis.
        depth = _paren_depth(p.text, 0)
        for e in p:
            href = e.get('href')
            if e.tag == 'a' and not depth and href:
                return 'https://en.wikipedia.org' + href
            # The serialized element includes its tail text, so parentheses
            # between this element and the next sibling are counted too.
            depth = _paren_depth(html.tostring(e, encoding='unicode'), depth)
    return None
# Follow the chain of first links, starting at `link`, for at most `limit`
# pages or until the article titled 'Philosophy' is reached.
visited = set()
for step in range(limit):
    # Stop instead of spending the remaining jumps going round a cycle.
    if link in visited:
        print("Loop detected at {}; stopping.".format(link))
        break
    visited.add(link)
    # A timeout keeps a stalled connection from hanging the script, and
    # raise_for_status() stops error pages from being parsed as articles.
    page = requests.get(link, timeout=10)
    page.raise_for_status()
    tree = get_page_tree(page)
    title = get_title(tree)
    print("{}\n#{}: {}".format(link, step + 1, title))
    if title == 'Philosophy':
        break
    link = get_first_link(tree)
    # get_first_link returns None when the page has no usable link; passing
    # None on to requests.get would raise instead of ending cleanly.
    if link is None:
        print("No followable link found on '{}'; stopping.".format(title))
        break
    # Brief pause between requests.
    time.sleep(0.1)
@vijayanandrp
Copy link

vijayanandrp commented May 25, 2017

Line no: 27 'https://en.wikipedia.org' + str(e.attrib['href']).replace('(', '%28').replace(')', '%29')

Try the change above to handle parentheses in the link URL.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment