Skip to content

Instantly share code, notes, and snippets.

@aviraldg
Created August 5, 2012 08:12
Show Gist options
  • Save aviraldg/3262821 to your computer and use it in GitHub Desktop.
Save aviraldg/3262821 to your computer and use it in GitHub Desktop.
Wikipdf
import requests
import bs4
import io
# URL template for adding one article to the session's Special:Book
# collection; {0} is replaced with the already URL-encoded article title.
# HTTPS is used directly so that no HTTP->HTTPS redirect sits between the
# client and the endpoint.
ADD_URI = 'https://en.wikipedia.org/w/index.php?title=Special:Book&bookcmd=add_article&arttitle={0}&oldid=0'
# Special:Book endpoint that receives the render POST. This must not go
# through a redirect: Requests re-issues a redirected POST as a GET, which
# would silently drop the form data.
BOOK_URI = 'https://en.wikipedia.org/wiki/Special:Book'
def wikify(topic):
    """Return *topic* encoded for use as a URL query-string value.

    Spaces become ``+`` (as before); every other character that is not safe
    inside a query string (``&``, ``+``, ``#``, ``%``, non-ASCII, ...) is
    percent-encoded so that the title cannot break or alter the request URL.

    Args:
        topic: Page title, as text or as UTF-8 encoded bytes.

    Returns:
        The encoded title as an ASCII-only native string.
    """
    # Local import with a fallback so the function works on both Python 2
    # (urllib.quote_plus) and Python 3 (urllib.parse.quote_plus).
    try:
        from urllib import quote_plus
    except ImportError:
        from urllib.parse import quote_plus
    # Encode text to UTF-8 bytes first: Python 2's quote_plus cannot handle
    # non-ASCII unicode input, while both versions accept bytes.
    if not isinstance(topic, bytes):
        topic = topic.encode('utf-8')
    return quote_plus(topic)
def main(args):
    """Collect Wikipedia pages into a book, render it, and fetch the PDF.

    Reads one page title per line from the file named by ``args[1]``, adds
    each title to the HTTP session's Special:Book collection, and posts a
    render request. The user is then asked whether to open the render
    progress page in a web browser; if not, the progress page is polled
    until it contains the word ``finished`` and the linked document is
    saved as ``args[1] + '.pdf'``.

    Args:
        args: Argument list in ``sys.argv`` form; ``args[0]`` is the program
            name and ``args[1]`` is the path to the page-list file.

    Returns:
        None.

    Raises:
        requests.exceptions.HTTPError: If the render request, a progress
            poll, or the PDF download returns an HTTP error status.
    """
    print('[ Wikipdf ]')
    print('(C) 2012 Aviral Dasgupta - www.aviraldg.com')
    if len(args) < 2:
        print('Usage: {0} <path.to.page.list>'.format(args[0]))
        return

    # One title per line; blank lines are skipped so they are not submitted
    # as empty article titles.
    with io.open(args[1], 'r') as page_list:
        stripped = (line.strip() for line in page_list)
        topics = [title for title in stripped if title]
    if not topics:
        print('No pages listed in {0}; nothing to do.'.format(args[1]))
        return

    print('Downloading: ' + ', '.join(topics) + '.')
    print('({0} pages; this may take a while)'.format(len(topics)))

    # A single session is used throughout; presumably the book collection is
    # tracked server-side through the session's cookies.
    s = requests.session()
    for topic in topics:
        encoded = wikify(topic)
        print('Adding {0}'.format(encoded))
        s.get(ADD_URI.format(encoded))
    print('Done.')

    # NOTE(review): 'rl' is the value sent for the form's format selector;
    # presumably it selects the PDF writer - confirm against the book form.
    p = s.post(BOOK_URI, {'formatSelect': 'rl', 'bookcmd': 'render'})
    p.raise_for_status()
    print('Rendering...')
    print('Render progress page is at {0}'.format(p.url))
    print('You can check manually or allow Wikipdf to check and download.')

    # raw_input exists only on Python 2; fall back to input on Python 3.
    try:
        ask = raw_input
    except NameError:
        ask = input
    opt = ask('Open web browser with progress page url? [Y/N]')
    if opt.lower().strip().startswith('y'):
        import webbrowser
        webbrowser.open(p.url)
        return

    # Poll the progress page, waiting one second longer before each check.
    import time
    delay = 1
    while True:
        time.sleep(delay)
        print('Checking ({0}) ...'.format(delay))
        delay += 1
        p = s.get(p.url)
        p.raise_for_status()
        if 'finished' in p.text:
            break

    print('Rendering done; downloading.')
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = bs4.BeautifulSoup(p.content, 'html.parser')
    # NOTE(review): assumes the fourth <a> on the finished page is the
    # download link - fragile; confirm against the current page markup.
    pdfuri = soup.find_all('a')[3]['href']
    download = s.get(pdfuri)
    # Fail loudly rather than saving an HTTP error page as the PDF.
    download.raise_for_status()
    with io.open(args[1] + '.pdf', 'wb') as out:
        out.write(download.content)
# Script entry point: run the tool only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    import sys

    # Hand over the complete argument vector; main() uses args[0] as the
    # program name in its usage message.
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment