Created
June 16, 2013 10:21
-
-
Save danmichaelo/5791626 to your computer and use it in GitHub Desktop.
Script som finner artikler som er kategorisert i `Fødsler i *` og/eller `Dødsfall i *`, men ikke kategorisert i andre kategorier enn disse, samt evt. `Personer fra *` og vedlikeholdskategorier.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding=utf-8 | |
from __future__ import unicode_literals | |
from __future__ import print_function | |
import os | |
import pickle | |
import oursql | |
import re | |
# Connect to the nowiki database replica on Tool Labs, reading the
# credentials from the standard replica config file.
# charset=None / use_unicode=False: result rows come back as raw byte
# strings; the code below decodes them explicitly as UTF-8.
credentials = os.path.expanduser('~/replica.my.cnf')
db = oursql.connect(
    db='nowiki_p',
    host='nowiki.labsdb',
    read_default_file=credentials,
    charset=None,
    use_unicode=False,
)
cur = db.cursor()
def get_subcats(cat_name, _seen=None):
    """Return `cat_name` plus all transitively reachable subcategory titles.

    Walks the `categorylinks` table recursively via the module-level
    cursor `cur`. Titles are returned as unicode strings.

    `_seen` is internal cycle protection: MediaWiki category graphs are
    not guaranteed acyclic, and the original unguarded recursion would
    loop forever on a cycle. Callers should not pass it.
    """
    if _seen is None:
        _seen = set()
    _seen.add(cat_name)
    print(cat_name)
    # Parameterized query; cl_to stores titles as UTF-8 bytes.
    cur.execute(
        'SELECT page.page_title FROM page, categorylinks '
        'WHERE categorylinks.cl_to=? AND categorylinks.cl_type="subcat" '
        'AND categorylinks.cl_from=page.page_id',
        [cat_name.encode('utf-8')])
    subcats = [row[0].decode('utf-8') for row in cur.fetchall()]
    allcats = [cat_name]
    for cat in subcats:
        # Skip two housekeeping categories and anything already visited.
        if cat in ('Mangler_interwiki', 'Kategorier_som_trenger_diffusjon'):
            continue
        if cat in _seen:
            continue
        allcats.extend(get_subcats(cat, _seen))
    return allcats
# Cache the maintenance-category list on disk: the recursive category
# walk is expensive, so reuse a previous run's result when available.
# Pickle data is binary, so the files must be opened 'rb'/'wb' (the
# original text-mode 'r'/'w' is incorrect for pickle), and the `with`
# blocks guarantee the handles are closed.
if os.path.exists('cats.dump'):
    with open('cats.dump', 'rb') as fp:
        mcats = pickle.load(fp)
    print("Read %d cats from file" % len(mcats))
else:
    mcats = get_subcats('Wikipedia-vedlikehold')
    with open('cats.dump', 'wb') as fp:
        pickle.dump(mcats, fp)
    print("Wrote %d cats to file" % len(mcats))
# Collect (and cache) the page IDs of every article categorized in a
# birth-year ("Fødsler i ...") or death-year ("Dødsfall i ...") category.
# Files are opened in binary mode because pickle data is binary, and
# `with` ensures they are closed.
if os.path.exists('allpages.dump'):
    with open('allpages.dump', 'rb') as fp:
        pages = pickle.load(fp)
    print('Read %d pages from file' % len(pages))
else:
    # GROUP BY cl_from deduplicates pages that are in both kinds of
    # category. The query is sent as UTF-8 bytes because the connection
    # uses charset=None.
    cur.execute('SELECT cl_from FROM categorylinks WHERE (cl_to LIKE "Fødsler_i_%" OR cl_to LIKE "Dødsfall_i_%") AND cl_type="page" GROUP BY cl_from'.encode('utf-8'))
    pages = [row[0] for row in cur.fetchall()]
    with open('allpages.dump', 'wb') as fp:
        pickle.dump(pages, fp)
    print('Wrote %d pages to file' % len(pages))
# Determine which candidate pages are "empty": categorized only in
# birth/death/"Personer fra" categories and/or maintenance categories,
# i.e. effectively uncategorized. Result is cached on disk.
if os.path.exists('empty.dump'):
    with open('empty.dump', 'rb') as fp:  # binary mode: pickle data
        empty = pickle.load(fp)
    print('Read %d empties from file' % len(empty))
else:
    empty = []
    prefixes = ['Personer_fra_', 'Fødsler_i_', 'Dødsfall_i_']
    r1 = re.compile(r'(' + '|'.join(prefixes) + ')', flags=re.I)
    # Hoist the membership structure out of the loop: `mcats` is a list,
    # and `cat in mcats` inside the per-page loop would be O(n) per test.
    maintenance = frozenset(mcats)
    for page in pages:
        cur.execute('SELECT cl_to FROM categorylinks WHERE cl_from=?', [page])
        cats = [row[0].decode('utf-8') for row in cur.fetchall()]
        # Categories that are neither birth/death/origin nor maintenance.
        othercats = [cat for cat in cats
                     if not r1.match(cat) and cat not in maintenance]
        print(page, "Other cats", othercats)
        if not othercats:
            print("EMPTY")
            empty.append(page)
    with open('empty.dump', 'wb') as fp:
        pickle.dump(empty, fp)
    print('Wrote %d empties to file' % len(empty))
# Write the result as a wiki bullet list, one line per page. Using
# {{ns:<namespace>}} lets the wiki resolve the namespace prefix itself.
# `with` fixes the original's never-closed file handle.
with open('uncat.txt', 'w') as f:
    for page_id in empty:
        print(page_id)
        cur.execute('SELECT page_namespace, page_title FROM page WHERE page_id=? LIMIT 1',
                    [int(page_id)])
        rows = cur.fetchall()
        if not rows:
            # Page vanished between runs (cached IDs can be stale);
            # skip instead of crashing on rows[0].
            continue
        page = rows[0]
        page_title = page[1].decode('utf-8').replace('_', ' ')
        # Skip articles whose titles start with "Dødsfall i " —
        # NOTE(review): presumably list pages named like the categories;
        # intent inferred, confirm with the author.
        if not re.match('Dødsfall i ', page_title):
            print(page_title)
            f.write(('* [[{{ns:%s}}:%s]]\n' % (page[0], page_title)).encode('utf-8'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment