A script to archive the static HTML dumps that Wikimedia generated years back. Run with python worker.py, at your own risk.
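For example, after filling in the configuration block at the top of the script (the S3 keys, tempdir, and the three date values), a run is simply:

python worker.py

The script scrapes the list of language directories for the configured dump date, rsyncs each one into tempdir, uploads its files to archive.org, and deletes the local copy before moving on to the next language.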
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2012 Hydriz
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import inspect
import os
import re
import sys
import time
import urllib

# Configuration

# S3 uploading keys (get a pair at http://archive.org/account/s3.php)
accesskey = ""
secretkey = ""

# Temporary directory on your local machine to store the dumps
tempdir = ""

# Desired rsync host to get the dumps from, and the matching HTTP mirror
rsynchost = "ftpmirror.your.org::wikimedia-dumps/other/static_html_dumps"
httphost = "http://dumps.wikimedia.your.org/other/static_html_dumps"

dumpdate = "" # Directory (dump date) containing all the wiki HTML dumps. Replace spaces with underscores.
itemdate = "" # Numerical date (e.g. 200709)
titledate = "" # Spelt-out date (e.g. September 2007)

# Archive.org matters
collection = "wikimedia-other"
mediatype = "web"
sizehint = "32212254720" # Size hint (in bytes) so that PUT requests do not fail because of space issues (default 30 GB)
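
# A hypothetical example configuration (a sketch only; the dates follow the
# sample values in the comments above, and the keys are placeholders, not
# real credentials):
#
#   accesskey = "AKIAEXAMPLEKEY"
#   secretkey = "examplesecretkey"
#   tempdir = "/tmp/static_html_dumps"
#   dumpdate = "September_2007"
#   itemdate = "200709"
#   titledate = "September 2007"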

# Nothing to change below...

# Files to upload for each wiki. A list rather than a set, so that iteration
# order is deterministic and the first upload (which creates the bucket) is
# always a real filename rather than a bare suffix.
files = [
    'html.lst',
    'images.lst',
    'skins.lst',
    '-html.7z', # Suffix of full filename (i.e. wikipedia-aa-html.7z)
    '-html.tar.7z', # Some dumps have this naming
]

# Globals filled in at runtime
langlist = ""
dumpdate2 = ""
dumpdate3 = ""
nowlang = ""
curwiki = ""
count = 0

def welcome():
    print "Welcome to the static HTML dumps archiving tool!"

def bye():
    print "Done, bye!"

def help():
    print "This is a script to archive the static HTML dumps from Wikimedia"
    print "Usage: python " + inspect.getfile(inspect.currentframe())

def scanDir():
    # Scrape the language subdirectories from the HTTP index page
    global langlist
    directory = urllib.urlopen(httphost + "/" + dumpdate + "/")
    raw = directory.read()
    directory.close()
    langs = re.compile(r'<a href="(?P<lang>[^>]+)/">[^<]+</a>').finditer(raw)
    langlisting = []
    for lang in langs:
        langlisting.append(lang.group('lang'))
    langlist = langlisting

def dldFiles(curlang):
    # Fetch one language's dump directory into tempdir via rsync
    os.chdir(tempdir)
    os.system("rsync -av " + rsynchost + "/" + dumpdate + "/" + curlang + "/ " + curlang)

def onebyone():
    # Work through the languages one at a time: download, upload, clean up
    for wikilang in langlist:
        if wikilang == "..":
            continue
        dldFiles(wikilang)
        uploadFiles(wikilang)

def uploadFiles(curlang):
    global files, nowlang, count
    os.chdir(tempdir + "/" + curlang)
    nowlang = curlang
    makeSpace()
    rmConnectors()
    addSuffix()
    for afile in files:
        if count == 0:
            # The first upload also creates the archive.org bucket (item)
            makebucket(afile)
            time.sleep(60) # give archive.org time to create the bucket
        elif afile in ("-html.7z", "-html.tar.7z"):
            # These entries are filename suffixes (i.e. wikipedia-aa-html.7z)
            generatecurl("wikipedia-" + curlang + afile)
        else:
            generatecurl(afile)
        count += 1
    os.chdir(tempdir)
    os.system("rm -rf " + curlang)
    count = 0

def makebucket(whatfile):
    # Create the archive.org item (bucket) with its metadata while uploading
    # the first file
    global dumpdate2, dumpdate3, curwiki
    curl = ['curl', '--retry 20', '--location',
        '--header', "'x-amz-auto-make-bucket:1'",
        '--header', "'x-archive-meta01-collection:%s'" % (collection),
        '--header', "'x-archive-meta-mediatype:%s'" % (mediatype),
        '--header', "'x-archive-queue-derive:0'",
        '--header', "'x-archive-size-hint:%s'" % (sizehint),
        '--header', "'x-archive-meta-title:Wikimedia static HTML dump of %s on %s'" % (curwiki, dumpdate2),
        '--header', "'x-archive-meta-description:This is the static HTML dump of %s on %s made by the Wikimedia Foundation and available for <a href=\"http://dumps.wikimedia.org/other/static_html_dumps/%s/%s/\">download</a> from the Wikimedia website.'" % (curwiki, dumpdate2, dumpdate, nowlang),
        '--header', '"authorization: LOW %s:%s"' % (accesskey, secretkey),
        '--upload-file', "%s http://s3.us.archive.org/html-%s-%s/%s" % (whatfile, curwiki, dumpdate3, whatfile),
    ]
    os.system(' '.join(curl))

def generatecurl(whatfile):
    # Upload one file into the existing archive.org bucket
    global dumpdate2, dumpdate3, curwiki
    curl = ['curl', '--retry 20', '--location',
        '--header', "'x-archive-queue-derive:0'",
        '--header', '"authorization: LOW %s:%s"' % (accesskey, secretkey),
        '--upload-file', "%s http://s3.us.archive.org/html-%s-%s/%s" % (whatfile, curwiki, dumpdate3, whatfile),
    ]
    os.system(' '.join(curl))

def makeSpace():
    # Use the spelt-out date (e.g. "September 2007") in item titles
    global titledate, dumpdate2
    dumpdate2 = titledate

def rmConnectors():
    # Use the numerical date (e.g. "200709") in item identifiers
    global itemdate, dumpdate3
    dumpdate3 = itemdate

def addSuffix():
    # Turn a language code into a wiki name (e.g. "aa" -> "aawiki")
    global curwiki, nowlang
    curwiki = nowlang + "wiki"

def runProcess():
    welcome()
    scanDir()
    onebyone()
    bye()

runProcess()
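
# A minimal alternative sketch (not part of the original script): the same
# upload done with subprocess instead of os.system, which sidesteps the
# shell quoting of the header strings above. The helper name s3_upload is
# hypothetical and assumes the accesskey/secretkey globals set earlier.
import subprocess

def s3_upload(localfile, bucket, remotename):
    # e.g. s3_upload("html.lst", "html-aawiki-200709", "html.lst")
    subprocess.call([
        "curl", "--retry", "20", "--location",
        "--header", "x-archive-queue-derive:0",
        "--header", "authorization: LOW %s:%s" % (accesskey, secretkey),
        "--upload-file", localfile,
        "http://s3.us.archive.org/%s/%s" % (bucket, remotename),
    ])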