Skip to content

Instantly share code, notes, and snippets.

@whym
Created March 12, 2012 11:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save whym/2021350 to your computer and use it in GitHub Desktop.
extract shortest pages (for MediaWiki/Wikimedia)
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import fileinput
import sys
import json
import urllib2
import argparse
from datetime import datetime
def format_date(x):
    """Render a datetime as an ISO-8601-style UTC timestamp string."""
    return x.strftime('%Y-%m-%dT%H:%M:%SZ')
def parse_date(x):
    """Parse a 14-digit MediaWiki timestamp (YYYYMMDDHHMMSS) into a datetime."""
    fmt = '%Y%m%d%H%M%S'
    return datetime.strptime(x, fmt)
def format_wikitext(lines):
    """Yield the lines of a sortable wikitext table of short pages.

    Each element of *lines* is a tab-separated record of
    (namespace, title, byte length, timestamp, revision id);
    one wikitext line is yielded per call.
    """
    yield '{|class="wikitable sortable"'
    yield '! Page !! Bytes !! Timestamp'
    row = '| [[%s]] || %s || [//{{SERVERNAME}}/{{SCRIPTPATH}}/index.php?oldid=%s %s]'
    for record in lines:
        ns, title, size, stamp, oldid = record.strip().split('\t')
        yield '|-'
        yield row % (title, size, oldid, format_date(parse_date(stamp)))
    yield '|}'
def render(text, script):
    """Render wikitext to HTML via a MediaWiki parse API.

    text   -- the wikitext to render
    script -- full URL of the wiki's api.php endpoint
    Returns the HTML fragment from the API response
    (['parse']['text']['*']).  On a URL error the error is printed to
    stderr and the process exits with status 1.
    """
    url = script
    # POST body for action=parse; the wikitext is URL-quoted.
    data ='format=json&action=parse&text=%s' % (urllib2.quote(text))
    # NOTE(review): this loop never actually retries -- the try body
    # either breaks on success or exits the process in the except
    # clause, so at most one request is made.
    while True:
        try:
            #print >>sys.stderr, 'fetching %s' % url
            # Browser-like User-Agent, presumably to avoid being
            # blocked by the API front-end -- TODO confirm.
            res = urllib2.urlopen(urllib2.Request(url,
                                                  data=data,
                                                  headers={'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'})).read()
            break
        except urllib2.URLError, e:
            print >>sys.stderr, e
            exit(1)
    return json.loads(res.decode('utf-8'))['parse']['text']['*']
if __name__ == '__main__':
    # CLI: read tab-separated shortpage records (from a file or stdin)
    # and print either raw wikitext or a fully rendered HTML page.
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--format', choices=['wikitext','html'],
                        dest='format', default='wikitext',
                        help='')
    parser.add_argument('-s', '--site', metavar='ADDRESS',
                        dest='site', type=str, default='ja.wikipedia.org',
                        help='target wiki name')
    parser.add_argument('infile',
                        nargs='?', type=argparse.FileType('r'), default=sys.stdin)
    options = parser.parse_args()
    # Lazy generator of wikitext table lines built from the input records.
    formatted = format_wikitext(options.infile)
    if options.format == 'wikitext':
        print '<!-- generated: %s -->' % format_date(datetime.now())
        for line in formatted:
            print line
    else:
        # HTML mode: wrap the API-rendered table in a static page skeleton
        # that loads styles/scripts from the wiki (plus jQuery tablesorter)
        # so the table looks and sorts like a native wiki page.
        # %(site)s placeholders are filled in below.
        header = '''
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html lang="ja" dir="ltr" class="client-nojs" xmlns="http://www.w3.org/1999/xhtml">
<base href="//%(site)s" />
<head>
<title>jawiki shortpages</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="Content-Style-Type" content="text/css" />
<link rel="copyright" href="//creativecommons.org/licenses/by-sa/3.0/" />
<link rel="stylesheet" href="//bits.wikimedia.org/%(site)s/load.php?debug=false&amp;lang=ja&amp;modules=site&amp;only=styles&amp;skin=vector&amp;*" type="text/css" media="all" />
<style type="text/css" media="all">a:lang(ar),a:lang(ckb),a:lang(fa),a:lang(kk-arab),a:lang(mzn),a:lang(ps),a:lang(ur){text-decoration:none}a.new,#quickbar a.new{color:#ba0000}
</style>
<script src="//bits.wikimedia.org/%(site)s/load.php?debug=false&amp;lang=ja&amp;modules=startup&amp;only=scripts&amp;skin=vector&amp;*" type="text/javascript"></script>
<script type="text/javascript" src="//toolserver.org/~whym/jquery/jquery-latest.js"></script>
<script type="text/javascript" src="//toolserver.org/~whym/jquery/jquery.tablesorter.js"></script>
<script type="text/javascript">
$(document).ready(function()
{
$(".sortable").tablesorter();
}
);
</script>
</head>
<body class="mediawiki ltr sitedir-ltr ns-0 ns-subject skin-vector action-view">'''
        footer = '''
<!-- /footer -->
<script type="text/javascript">if(window.mw){
mw.loader.load(["mediawiki.user","mediawiki.page.ready","mediawiki.legacy.mwsuggest","ext.vector.collapsibleNav","ext.vector.collapsibleTabs","ext.vector.editWarning","ext.vector.simpleSearch"], null, true);
}</script>
</body>
</html>
'''
        header = header % {'site': options.site}
        # The API endpoint of the target wiki, used by render().
        script = 'http://%s/w/api.php' % options.site
        print header
        print '<!-- generated: %s -->' % format_date(datetime.now())
        # render() posts the whole wikitext table to the wiki's parse API
        # and returns the resulting HTML fragment.
        print render("\n".join(formatted), script).encode('utf-8')
        print footer
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# extract shortest pages (for MediaWiki/Wikimedia)
import oursql
import os
import argparse
import sys
import csv
import re
from collections import namedtuple
from datetime import datetime, timedelta
if __name__ == '__main__':
    # Query a Toolserver database replica for the shortest non-redirect
    # main-namespace pages and write them as tab-separated rows:
    # namespace, title, byte length, latest-revision timestamp, revision id.
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', metavar='FILE',
                        dest='output', type=lambda x: open(x, 'w'), default=sys.stdout,
                        help='')
    parser.add_argument('-l', '--limit', metavar='N', default=4000,
                        dest='limit', type=int,
                        help='maximum number of results')
    parser.add_argument('-s', '--size', metavar='N', default=128,
                        dest='len', type=int,
                        help='maximum page size')
    parser.add_argument('-d', '--db', metavar='DBNAME', required=True,
                        dest='db', type=str,
                        help='target wiki name')
    parser.add_argument('-P','--ignore-pages', metavar='PATTERN', default=[],
                        dest='ignorep', action='append',
                        help='ignore these pages')
    parser.add_argument('-C', '--ignore-categories', metavar='PATTERN', default=[],
                        dest='ignorec', action='append',
                        help='ignore pages in these categories')
    options = parser.parse_args()
    # Toolserver replica hosts are named after the db with '_' -> '-'
    # (e.g. "jawiki-p.rrdb.toolserver.org"); the database itself keeps '_'.
    options.db = options.db.replace('_','-')
    host = options.db + '.rrdb.toolserver.org'
    # charset=None / use_unicode=False: rows come back as raw bytes,
    # presumably so titles pass through to the TSV unrecoded -- TODO confirm.
    conn = oursql.connect(host = host,
                          read_default_file=os.path.expanduser('~/.my.cnf'),
                          db = options.db.replace('-','_'),
                          charset=None,
                          use_unicode=False)
    cursor = conn.cursor()
    # Build NOT LIKE filters from the -P / -C patterns.
    # NOTE(review): these patterns (and the query below) are spliced into
    # the SQL by string interpolation, not parameterized -- acceptable only
    # because input comes from the operator's own command line.
    condpages = ' AND '.join(['page_title not like "%s"' % x for x in options.ignorep])
    condcats = ' AND '.join(['cl_to not like "%s"' % x for x in options.ignorec])
    # '1' substitutes for an empty filter so the WHERE clause stays valid.
    # The NOT EXISTS subquery drops pages belonging to any ignored category.
    cursor.execute('''
SELECT page_namespace, page_title, page_len, rev_timestamp, rev_id FROM page LEFT JOIN revision
ON page_latest = rev_id
WHERE page_namespace = 0
AND page_is_redirect = 0
AND %(condpages)s
AND page_len < %(len)s
AND NOT EXISTS ( SELECT cl_from FROM categorylinks
WHERE cl_from = page_id
AND NOT (%(condcats)s) )
LIMIT %(limit)s
;
''' % {'condpages': condpages if len(condpages) > 0 else '1',
       'condcats': condcats if len(condcats) > 0 else '1',
       'len': options.len,
       'limit': options.limit})
    # Emit one tab-separated line per result row.
    writer = csv.writer(options.output, delimiter='\t')
    for col in list(cursor):
        writer.writerow(list(col))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment