Skip to content

Instantly share code, notes, and snippets.

@danmichaelo
Created June 16, 2013 10:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danmichaelo/5791626 to your computer and use it in GitHub Desktop.
Save danmichaelo/5791626 to your computer and use it in GitHub Desktop.
Script som finner artikler som er kategorisert i `Fødsler i *` og/eller `Dødsfall i *`, men ikke kategorisert i andre kategorier enn disse, samt evt. `Personer fra *` og vedlikeholdskategorier.
# encoding=utf-8
from __future__ import unicode_literals
from __future__ import print_function
import os
import pickle
import oursql
import re
# Connect to the nowiki replica database on Tool Labs.
# charset=None / use_unicode=False: fetch raw bytes and decode to UTF-8
# manually, since the replica's declared charset is unreliable.
connection_params = {
    'db': 'nowiki_p',
    'host': 'nowiki.labsdb',
    'read_default_file': os.path.expanduser('~/replica.my.cnf'),
    'charset': None,
    'use_unicode': False,
}
db = oursql.connect(**connection_params)
cur = db.cursor()
def get_subcats(cat_name, _seen=None):
    """Recursively collect `cat_name` and the names of all its subcategories.

    Queries the replica DB (module-global `cur`) for subcategories of
    `cat_name` and descends into each, skipping two known noise categories.

    :param cat_name: category title (unicode, underscores, no namespace prefix)
    :param _seen: internal set of already-visited category names; guards
        against infinite recursion, since the MediaWiki category graph
        may contain cycles. Leave as None when calling from outside.
    :return: list of category names (may contain duplicates if a category
        is reachable along several non-cyclic paths, matching the
        original behavior)
    """
    if _seen is None:
        _seen = set()
    _seen.add(cat_name)
    print(cat_name)
    cur.execute(
        'SELECT page.page_title FROM page, categorylinks '
        'WHERE categorylinks.cl_to=? AND categorylinks.cl_type="subcat" '
        'AND categorylinks.cl_from=page.page_id',
        [cat_name.encode('utf-8')])
    subcats = [row[0].decode('utf-8') for row in cur.fetchall()]
    allcats = [cat_name]
    # Maintenance-tracking categories that would drag in huge, irrelevant trees.
    skip = ('Mangler_interwiki', 'Kategorier_som_trenger_diffusjon')
    for cat in subcats:
        if cat in skip or cat in _seen:
            continue
        allcats.extend(get_subcats(cat, _seen))
    return allcats
# Load the maintenance-category list from a local cache if present;
# otherwise crawl the category tree and cache the result.
# Files are opened with context managers (original leaked the handles)
# and in binary mode, which pickle requires for portability.
if os.path.exists('cats.dump'):
    with open('cats.dump', 'rb') as fp:
        mcats = pickle.load(fp)
    print("Read %d cats from file" % len(mcats))
else:
    mcats = get_subcats('Wikipedia-vedlikehold')
    with open('cats.dump', 'wb') as fp:
        pickle.dump(mcats, fp)
    print("Wrote %d cats to file" % len(mcats))
# Load the candidate page IDs from cache if present; otherwise query for
# every article categorized in a birth-year or death-year category.
# Context managers + binary pickle mode fix the leaked/text-mode handles.
if os.path.exists('allpages.dump'):
    with open('allpages.dump', 'rb') as fp:
        pages = pickle.load(fp)
    print('Read %d pages from file' % len(pages))
else:
    # Query string is encoded because the connection is in raw-bytes mode.
    cur.execute(
        'SELECT cl_from FROM categorylinks '
        'WHERE (cl_to LIKE "Fødsler_i_%" OR cl_to LIKE "Dødsfall_i_%") '
        'AND cl_type="page" GROUP BY cl_from'.encode('utf-8'))
    pages = [row[0] for row in cur.fetchall()]
    with open('allpages.dump', 'wb') as fp:
        pickle.dump(pages, fp)
    print('Wrote %d pages to file' % len(pages))
# For each candidate page, find its categories that are neither
# birth/death/origin categories nor maintenance categories. Pages with no
# such "other" category are collected in `empty` (and cached).
if os.path.exists('empty.dump'):
    with open('empty.dump', 'rb') as fp:
        empty = pickle.load(fp)
    print('Read %d empties from file' % len(empty))
else:
    empty = []
    prefixes = ['Personer_fra_', 'Fødsler_i_', 'Dødsfall_i_']
    r1 = re.compile(r'(' + '|'.join(prefixes) + ')', flags=re.I)
    # Hoist list -> set: membership tests run once per category per page,
    # and mcats can be large.
    mcats_set = set(mcats)
    for page in pages:
        cur.execute('SELECT cl_to FROM categorylinks WHERE cl_from=?', [page])
        cats = [row[0].decode('utf-8') for row in cur.fetchall()]
        othercats = [cat for cat in cats
                     if not r1.match(cat) and cat not in mcats_set]
        print(page, "Other cats", othercats)
        if not othercats:
            print("EMPTY")
            empty.append(page)
    with open('empty.dump', 'wb') as fp:
        pickle.dump(empty, fp)
    print('Wrote %d empties to file' % len(empty))
# Write the result as a wiki-formatted link list. `with` ensures the file
# is flushed and closed (original leaked the handle, risking a truncated
# file). `startswith` replaces re.match with a literal, metachar-free
# pattern — same semantics, clearer intent.
with open('uncat.txt', 'w') as f:
    for page_id in empty:
        print(page_id)
        cur.execute(
            'SELECT page_namespace, page_title FROM page '
            'WHERE page_id=? LIMIT 1',
            [int(page_id)])
        row = cur.fetchall()[0]
        page_title = row[1].decode('utf-8').replace('_', ' ')
        # Skip the "Dødsfall i <year>" list articles themselves.
        if not page_title.startswith('Dødsfall i '):
            print(page_title)
            f.write(('* [[{{ns:%s}}:%s]]\n' % (row[0], page_title)).encode('utf-8'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment