Created
October 1, 2012 19:02
Revisions
-
epoz revised this gist
Oct 1, 2012. 1 changed file with 3 additions and 1 deletion. There are no files selected for viewing.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,4 +1,6 @@ #!/usr/bin/env python # Example: python gimmesrc.py De_Cive > txt import sys, urllib, urllib2 URL = 'http://en.wikisource.org/w/index.php?action=raw&title=' @@ -32,4 +34,4 @@ if line.startswith('[['): in_contents = False if in_contents: print line -
epoz created this gist
Oct 1, 2012. There are no files selected for viewing.
#!/usr/bin/env python
"""Print the text of a multi-page Wikisource work to standard output.

The script downloads the raw wikitext of the page named on the command
line, reads the sub-page links listed under its ``===Contents===``
heading, then downloads every sub-page and prints the lines found inside
its ``<div class="prose">`` block.

Example: python gimmesrc.py De_Cive > txt
"""
import sys

try:  # Python 3
    from urllib.parse import quote_plus
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import quote_plus
    from urllib2 import urlopen

URL = 'http://en.wikisource.org/w/index.php?action=raw&title='

# Characters surrounding a link target in a contents entry: the list
# bullet, the link brackets (both sides, so un-piped links such as
# ``[[/Preface]]`` lose their closing brackets too) and padding blanks.
_LINK_DECORATION = '*[] \t'


def fetch(page):
    """Return the raw wikitext of *page* as a native ``str``.

    *page* is appended verbatim to ``URL``, so it must already be safe to
    embed in a query string.  The HTTP response is always closed, even
    when reading fails.
    """
    response = urlopen(URL + page)
    try:
        raw = response.read()
    finally:
        response.close()
    # On Python 2 the payload already is ``str``; on Python 3 it is
    # ``bytes`` and has to be decoded before it can be split and printed.
    if isinstance(raw, str):
        return raw
    return raw.decode('utf-8')


def parse_contents(wikitext):
    """Return the sub-page link targets listed under ``===Contents===``.

    The contents list starts on the line after the heading and ends at
    the first empty line.  For each entry the text before the first ``|``
    is taken as the link target; only targets starting with ``/`` (links
    to sub-pages) are returned, in document order.
    """
    parts = []
    in_contents = False
    for line in wikitext.split('\n'):
        if line.strip() == '===Contents===':
            in_contents = True
            continue
        if not in_contents:
            continue
        if line == '':
            in_contents = False
            continue
        target = line.split('|')[0].strip(_LINK_DECORATION)
        if target.startswith('/'):
            parts.append(target)
    return parts


def extract_prose(wikitext):
    """Return the lines that follow a ``<div class="prose">`` line.

    Collection stops at the first line that starts with ``[[``; the
    marker line itself and that terminating line are not included.
    """
    lines = []
    in_prose = False
    for line in wikitext.split('\n'):
        if line.strip() == '<div class="prose">':
            in_prose = True
            continue
        if line.startswith('[['):
            in_prose = False
        if in_prose:
            lines.append(line)
    return lines


def main(argv=None):
    """Run the script and return a process exit status.

    *argv* is the argument list without the program name; it defaults to
    ``sys.argv[1:]``.  Returns 2 (after printing a usage line to stderr)
    when no title is given, 0 otherwise.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.stderr.write('usage: %s TITLE\n' % sys.argv[0])
        return 2

    title = args[0]
    for part in parse_contents(fetch(title)):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(part)
        subpage = quote_plus(part.replace(' ', '_'))
        for line in extract_prose(fetch(title + subpage)):
            print(line)
    return 0


if __name__ == '__main__':
    sys.exit(main())