Skip to content

Instantly share code, notes, and snippets.

@epoz
Created October 1, 2012 19:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save epoz/3813746 to your computer and use it in GitHub Desktop.
Save epoz/3813746 to your computer and use it in GitHub Desktop.
Retrieves the full source of a title from Wikisource
#!/usr/bin/env python
# Example: python gimmesrc.py De_Cive > txt
import sys, urllib, urllib2
URL = 'http://en.wikisource.org/w/index.php?action=raw&title='

# Line that opens the table of contents on a work's index page.
CONTENTS_HEADING = '===Contents==='
# Line that opens the body text on a subpage.
PROSE_MARKER = '<div class="prose">'
# Characters wrapped around a link target in a contents entry: the list
# bullet, the link brackets, and stray whitespace (including the '\r' left
# behind when CRLF text is split on '\n').
LINK_WRAPPING = '*[] \t\r'


def fetch_raw(page):
    """Return the raw wikitext of *page* (a title already safe for a URL).

    The response object is always closed, even if reading fails.
    """
    response = urllib2.urlopen(URL + page)
    try:
        return response.read()
    finally:
        response.close()


def parse_contents(text):
    """Return the subpage link targets listed under the contents heading.

    Collection starts on the line after ``===Contents===`` and stops at the
    first blank (or whitespace-only) line.  For every entry the text before
    the first ``|`` is taken as the link target and the bullet, brackets and
    surrounding whitespace are removed.  Only targets beginning with ``/``
    (subpages of the work) are returned, in document order.
    """
    subpages = []
    collecting = False
    for line in text.split('\n'):
        if line.strip() == CONTENTS_HEADING:
            collecting = True
            continue
        if not collecting:
            continue
        if not line.strip():
            # A blank line ends the contents list.
            collecting = False
            continue
        target = line.split('|')[0].strip(LINK_WRAPPING)
        if target.startswith('/'):
            subpages.append(target)
    return subpages


def extract_prose(text):
    """Return the lines of *text* that belong to the prose section.

    Output starts on the line after ``<div class="prose">`` and is switched
    off by the first following line that begins with ``[[``; neither marker
    line is included.  Lines are returned unmodified.
    """
    kept = []
    in_prose = False
    for line in text.split('\n'):
        if line.strip() == PROSE_MARKER:
            in_prose = True
            continue
        if line.startswith('[['):
            in_prose = False
        if in_prose:
            kept.append(line)
    return kept


def main(argv):
    """Print every subpage of the work named in ``argv[1]`` to stdout.

    For each subpage listed in the work's contents, the subpage name is
    printed followed by its prose lines.  Returns the process exit status:
    0 on success, 2 when no title was supplied.
    """
    if len(argv) < 2:
        sys.stderr.write('usage: gimmesrc.py TITLE > output.txt\n')
        return 2
    # A literal space is not valid inside the request URL; the example
    # invocation already uses underscores in place of spaces.
    title = argv[1].replace(' ', '_')
    for part in parse_contents(fetch_raw(title)):
        print(part)
        subpage = urllib.quote_plus(part.replace(' ', '_'))
        for line in extract_prose(fetch_raw(title + subpage)):
            print(line)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment