A script to archive the static HTML dumps that Wikimedia generated years back. Run with python worker.py, at your own risk.
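For example, after filling in the configuration block at the top of the script (the S3 keys, tempdir, and the three date values), a run is simply:

python worker.py

The script scrapes the list of language directories for the configured dump date, rsyncs each one into tempdir, uploads its files to archive.org, and deletes the local copy before moving on to the next language.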
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2012 Hydriz
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import inspect
import os
import re
import sys
import time
import urllib

# Configuration

# S3 uploading keys (get a pair at http://archive.org/account/s3.php)
accesskey = ""
secretkey = ""

# Temporary directory on your local machine to store the dumps
tempdir = ""

# Desired rsync host to get the dumps from, and the matching HTTP mirror
rsynchost = "ftpmirror.your.org::wikimedia-dumps/other/static_html_dumps"
httphost = "http://dumps.wikimedia.your.org/other/static_html_dumps"

dumpdate = "" # Directory (dump date) containing all the wiki HTML dumps. Replace spaces with underscores.
itemdate = "" # Numerical date (e.g. 200709)
titledate = "" # Spelt-out date (e.g. September 2007)

# Archive.org matters
collection = "wikimedia-other"
mediatype = "web"
sizehint = "32212254720" # Size hint (in bytes) so that PUT requests do not fail because of space issues (default 30 GB)
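
# A hypothetical example configuration (a sketch only; the dates follow the
# sample values in the comments above, and the keys are placeholders, not
# real credentials):
#
#   accesskey = "AKIAEXAMPLEKEY"
#   secretkey = "examplesecretkey"
#   tempdir = "/tmp/static_html_dumps"
#   dumpdate = "September_2007"
#   itemdate = "200709"
#   titledate = "September 2007"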

# Nothing to change below...

# Files to upload for each wiki. A list rather than a set, so that iteration
# order is deterministic and the first upload (which creates the bucket) is
# always a real filename rather than a bare suffix.
files = [
    'html.lst',
    'images.lst',
    'skins.lst',
    '-html.7z', # Suffix of full filename (i.e. wikipedia-aa-html.7z)
    '-html.tar.7z', # Some dumps have this naming
]

# Globals filled in at runtime
langlist = ""
dumpdate2 = ""
dumpdate3 = ""
nowlang = ""
curwiki = ""
count = 0

def welcome():
    print "Welcome to the static HTML dumps archiving tool!"

def bye():
    print "Done, bye!"

def help():
    print "This is a script to archive the static HTML dumps from Wikimedia"
    print "Usage: python " + inspect.getfile(inspect.currentframe())

def scanDir():
    # Scrape the language subdirectories from the HTTP index page
    global langlist
    directory = urllib.urlopen(httphost + "/" + dumpdate + "/")
    raw = directory.read()
    directory.close()
    langs = re.compile(r'<a href="(?P<lang>[^>]+)/">[^<]+</a>').finditer(raw)
    langlisting = []
    for lang in langs:
        langlisting.append(lang.group('lang'))
    langlist = langlisting

def dldFiles(curlang):
    # Fetch one language's dump directory into tempdir via rsync
    os.chdir(tempdir)
    os.system("rsync -av " + rsynchost + "/" + dumpdate + "/" + curlang + "/ " + curlang)

def onebyone():
    # Work through the languages one at a time: download, upload, clean up
    for wikilang in langlist:
        if wikilang == "..":
            continue
        dldFiles(wikilang)
        uploadFiles(wikilang)

def uploadFiles(curlang):
    global files, nowlang, count
    os.chdir(tempdir + "/" + curlang)
    nowlang = curlang
    makeSpace()
    rmConnectors()
    addSuffix()
    for afile in files:
        if count == 0:
            # The first upload also creates the archive.org bucket (item)
            makebucket(afile)
            time.sleep(60) # give archive.org time to create the bucket
        elif afile in ("-html.7z", "-html.tar.7z"):
            # These entries are filename suffixes (i.e. wikipedia-aa-html.7z)
            generatecurl("wikipedia-" + curlang + afile)
        else:
            generatecurl(afile)
        count += 1
    os.chdir(tempdir)
    os.system("rm -rf " + curlang)
    count = 0

def makebucket(whatfile):
    # Create the archive.org item (bucket) with its metadata while uploading
    # the first file
    global dumpdate2, dumpdate3, curwiki
    curl = ['curl', '--retry 20', '--location',
        '--header', "'x-amz-auto-make-bucket:1'",
        '--header', "'x-archive-meta01-collection:%s'" % (collection),
        '--header', "'x-archive-meta-mediatype:%s'" % (mediatype),
        '--header', "'x-archive-queue-derive:0'",
        '--header', "'x-archive-size-hint:%s'" % (sizehint),
        '--header', "'x-archive-meta-title:Wikimedia static HTML dump of %s on %s'" % (curwiki, dumpdate2),
        '--header', "'x-archive-meta-description:This is the static HTML dump of %s on %s made by the Wikimedia Foundation and available for <a href=\"http://dumps.wikimedia.org/other/static_html_dumps/%s/%s/\">download</a> from the Wikimedia website.'" % (curwiki, dumpdate2, dumpdate, nowlang),
        '--header', '"authorization: LOW %s:%s"' % (accesskey, secretkey),
        '--upload-file', "%s http://s3.us.archive.org/html-%s-%s/%s" % (whatfile, curwiki, dumpdate3, whatfile),
    ]
    os.system(' '.join(curl))

def generatecurl(whatfile):
    # Upload one file into the existing archive.org bucket
    global dumpdate2, dumpdate3, curwiki
    curl = ['curl', '--retry 20', '--location',
        '--header', "'x-archive-queue-derive:0'",
        '--header', '"authorization: LOW %s:%s"' % (accesskey, secretkey),
        '--upload-file', "%s http://s3.us.archive.org/html-%s-%s/%s" % (whatfile, curwiki, dumpdate3, whatfile),
    ]
    os.system(' '.join(curl))

def makeSpace():
    # Use the spelt-out date (e.g. "September 2007") in item titles
    global titledate, dumpdate2
    dumpdate2 = titledate

def rmConnectors():
    # Use the numerical date (e.g. "200709") in item identifiers
    global itemdate, dumpdate3
    dumpdate3 = itemdate

def addSuffix():
    # Turn a language code into a wiki name (e.g. "aa" -> "aawiki")
    global curwiki, nowlang
    curwiki = nowlang + "wiki"

def runProcess():
    welcome()
    scanDir()
    onebyone()
    bye()

runProcess()
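
# A minimal alternative sketch (not part of the original script): the same
# upload done with subprocess instead of os.system, which sidesteps the
# shell quoting of the header strings above. The helper name s3_upload is
# hypothetical and assumes the accesskey/secretkey globals set earlier.
import subprocess

def s3_upload(localfile, bucket, remotename):
    # e.g. s3_upload("html.lst", "html-aawiki-200709", "html.lst")
    subprocess.call([
        "curl", "--retry", "20", "--location",
        "--header", "x-archive-queue-derive:0",
        "--header", "authorization: LOW %s:%s" % (accesskey, secretkey),
        "--upload-file", localfile,
        "http://s3.us.archive.org/%s/%s" % (bucket, remotename),
    ])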