A script to archive the static HTML dumps that Wikimedia generated years back. Run with python worker.py, at your own risk.
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2012 Hydriz
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import inspect
import os
import re
import sys
import urllib
# Configuration
# S3 uploading keys (Get one at http://archive.org/account/s3.php)
accesskey = ""
secretkey = ""
# Temporary directory on your local machine to store the dumps
tempdir = ""
# Desired rsync host to get the dumps
rsynchost = "ftpmirror.your.org::wikimedia-dumps/other/static_html_dumps"
httphost = "http://dumps.wikimedia.your.org/other/static_html_dumps"
dumpdate = "" # Directory (dump date) containing all the wiki html dumps. Replace spaces with underscores.
itemdate = "" # Numerical date (e.g. 200709)
titledate = "" # Spelt out date (e.g. September 2007)
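# A hypothetical configuration for the September 2007 run, following the
# naming conventions in the comments above (illustrative values only, not
# part of the original script):
#   dumpdate  = "September_2007"
#   itemdate  = "200709"
#   titledate = "September 2007"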
# Archive.org matters
collection = "wikimedia-other"
mediatype = "web"
sizehint = "32212254720" # Size hint (in bytes) so that PUT requests do not fail for lack of space (default 30 GB)
# Nothing to change below...
# Files (and filename suffixes) to upload for each language. Order matters:
# the first entry must be an actual filename, since it is the one used to
# create the archive.org bucket in uploadFiles().
files = [
    'html.lst',
    'images.lst',
    'skins.lst',
    '-html.7z',     # Suffix of the full filename (i.e. wikipedia-aa-html.7z)
    '-html.tar.7z', # Some dumps use this naming instead
]

# Runtime state (populated as the script runs)
langlist = []    # language directories found on the dump server
dumpdate2 = ""   # spelt-out date used in item titles (set from titledate)
dumpdate3 = ""   # numerical date used in item identifiers (set from itemdate)
nowlang = ""     # language code currently being processed
curwiki = ""     # database name of the current wiki (e.g. "aawiki")
count = 0        # files uploaded so far for the current language

def welcome():
    print "Welcome to the static HTML dumps archiving tool!"

def bye():
    print "Done, bye!"

def help():
    print "This is a script to archive the static HTML dumps from Wikimedia"
    print "Usage: python " + inspect.getfile(inspect.currentframe())

def scanDir():
    # Fetch the directory listing for the dump date over HTTP and collect
    # the per-language subdirectories linked from it.
    global langlist
    directory = urllib.urlopen(httphost + "/" + dumpdate + "/")
    raw = directory.read()
    directory.close()
    langs = re.compile(r'<a href="(?P<lang>[^>]+)/">[^<]+</a>').finditer(raw)
    langlist = [lang.group('lang') for lang in langs]

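# As a sketch of what scanDir() matches: an index line such as
# '<a href="aa/">aa/</a>' (a hypothetical listing entry) would yield the
# language code "aa".
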
def dldFiles(curlang):
    # Download the language's dump files from the rsync mirror into tempdir.
    os.chdir(tempdir)
    os.system("rsync -av " + rsynchost + "/" + dumpdate + "/" + curlang + "/ " + curlang)

def onebyone():
    # Work through each language in turn: download its dump, upload it,
    # then move on, so only one language sits on disk at a time.
    for wikilang in langlist:
        if wikilang == "..":
            continue # skip the parent-directory link
        dldFiles(wikilang)
        uploadFiles(wikilang)

def uploadFiles(curlang):
    # Upload every file for this language to archive.org. The first file
    # creates the item (bucket); later files are added to it. The local
    # copy is removed once everything has been uploaded.
    global files, nowlang, count
    os.chdir(tempdir + "/" + curlang)
    nowlang = curlang
    for afile in files:
        makeSpace()
        rmConnectors()
        addSuffix()
        if count == 0:
            makebucket(afile)
            os.system("sleep 60") # give archive.org time to create the bucket
            count += 1
        else:
            if afile in ("-html.7z", "-html.tar.7z"):
                # These entries are suffixes; build the full filename
                filename = "wikipedia-" + curlang + afile
                generatecurl(filename)
            else:
                generatecurl(afile)
            count += 1
    os.chdir(tempdir)
    os.system("rm -rf " + curlang)
    count = 0

def makebucket(whatfile):
    # Create the archive.org item via the S3-compatible API (the
    # x-amz-auto-make-bucket header) and upload the first file into it.
    global dumpdate2, dumpdate3, curwiki
    curl = ['curl', '--retry 20', '--location',
            '--header', "'x-amz-auto-make-bucket:1'",
            '--header', "'x-archive-meta01-collection:%s'" % (collection),
            '--header', "'x-archive-meta-mediatype:%s'" % (mediatype),
            '--header', "'x-archive-queue-derive:0'",
            '--header', "'x-archive-size-hint:%s'" % (sizehint),
            '--header', "'x-archive-meta-title:Wikimedia static HTML dump of %s on %s'" % (curwiki, dumpdate2),
            '--header', "'x-archive-meta-description:This is the static HTML dump of %s on %s made by the Wikimedia Foundation and available for <a href='http://dumps.wikimedia.org/other/static_html_dumps/%s/%s/'>download</a> from the Wikimedia website.'" % (curwiki, dumpdate2, dumpdate, nowlang),
            '--header', '"authorization: LOW %s:%s"' % (accesskey, secretkey),
            '--upload-file', "%s http://s3.us.archive.org/html-%s-%s/%s" % (whatfile, curwiki, dumpdate3, whatfile),
            ]
    os.system(' '.join(curl))

def generatecurl(whatfile):
    # Upload a single file into the already-created archive.org item.
    global dumpdate2, dumpdate3, curwiki
    curl = ['curl', '--retry 20', '--location',
            '--header', "'x-archive-queue-derive:0'",
            '--header', '"authorization: LOW %s:%s"' % (accesskey, secretkey),
            '--upload-file', "%s http://s3.us.archive.org/html-%s-%s/%s" % (whatfile, curwiki, dumpdate3, whatfile),
            ]
    os.system(' '.join(curl))

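# For illustration, the command that generatecurl() assembles has this shape
# (credentials elided, names hypothetical):
#   curl --retry 20 --location --header 'x-archive-queue-derive:0' \
#        --header "authorization: LOW <accesskey>:<secretkey>" \
#        --upload-file html.lst http://s3.us.archive.org/html-aawiki-200709/html.lst
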
def makeSpace():
    # Use the spelt-out date for item titles and descriptions.
    global titledate, dumpdate2
    dumpdate2 = titledate

def rmConnectors():
    # Use the numerical date for item identifiers.
    global itemdate, dumpdate3
    dumpdate3 = itemdate

def addSuffix():
    # Derive the database name from the language code (e.g. "aa" -> "aawiki").
    global curwiki, nowlang
    curwiki = nowlang + "wiki"

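# A worked example, assuming itemdate = "200709", titledate = "September 2007"
# and nowlang = "aa" (hypothetical values): the helpers above set
# dumpdate3 = "200709", dumpdate2 = "September 2007" and curwiki = "aawiki",
# so uploads land in the item "html-aawiki-200709".
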
def runProcess():
    welcome()
    scanDir()
    onebyone()
    bye()

runProcess()
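
# Usage sketch: fill in the configuration block at the top, then run
#   python worker.py
# (assumes curl and rsync are installed and on the PATH)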