Skip to content

Instantly share code, notes, and snippets.

@hiropppe
Last active February 12, 2018 14:15
Show Gist options
  • Save hiropppe/df92dc00d72c7fd11f3c3f0131c5528a to your computer and use it in GitHub Desktop.
Save hiropppe/df92dc00d72c7fd11f3c3f0131c5528a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding:utf8 -*-
from __future__ import unicode_literals
import codecs
import gzip
import re
import sys
import unicodedata
from tqdm import tqdm
try:
    # Python 2: rebind chr so it returns unicode characters for code points
    # above 255 (the kana sets in this module are built with it).
    chr = unichr
except NameError:
    # Python 3: unichr no longer exists and the builtin chr() already
    # returns text, so there is nothing to rebind.
    pass
# Matches one parenthesised row tuple of the SQL dump, "(<int>,<int>,<field>,...)",
# and captures the third field (optionally single-quoted), which this script
# treats as the page title. The captured field cannot contain a comma or a
# single quote.
re_parentheses_title = re.compile(r"\(\d+,\d+,'?([^,']+)'?,[^\)]+\)")
# Matches a trailing "_(...)" qualifier at the very end of a title.
# Raw string so the regex escapes are not read as (invalid) string escapes.
re_title_brackets = re.compile(r'_\([^\)]+\)$')
# Titles starting with any of these are filtered out (checked with
# str.startswith, which requires a tuple).
IGNORE_PREFIX = (
    'アップロードログ',
    '削除記録/',
    '削除依頼/',
    '検証/',
    '進行中の荒らし行為/',
    '井戸端/',
    'WP:',
    '利用者:',
    'User:',
    'ウィキプロジェクト',
    'PJ:',
    'メインページ/',
    '索引 ',
    '著作権/',
    'Location map/',
)

# Titles ending with any of these are filtered out (checked with
# str.endswith): file-like extensions, list pages and deletion sub-pages.
IGNORE_SUFFIX = (
    '.jpg',
    '.jpeg',
    '.png',
    '.gif',
    '.js',
    '.css',
    'の一覧',
    '/削除',
)

# Titles containing any of these anywhere are filtered out.
IGNORE_SUBSTR = (
    'Wikipedia:',
    'Template:',
    'Listes:',
    '過去ログ:',
    'ファイル:',
    '画像:',
    'Section:',
    '/過去ログ',
    '/history',
    '/log',
    '/sandbox',
)
# Whole-title patterns; a title matching any of them is filtered out.
IGNORE_PATTERNS = (
    # Digits only.
    re.compile(r'^\d+$'),
    # "<n>年" with optional "<n>月" and "<n>日" parts.
    re.compile(r'^\d+年(?:\d+月)?(?:\d+日)?$'),
    # "<n>月" with an optional "<n>日" part.
    re.compile(r'^\d+月(?:\d+日)?$'),
    # "<n>日" alone.
    re.compile(r'^\d+日$'),
    # Four dot-separated digit groups. Every dot is escaped so that only a
    # literal "." is accepted between the groups.
    re.compile(r'^\d+\.\d+\.\d+\.\d+$'),
    # Word characters joined by one or more colons ("a:b", "a:b:c", ...).
    re.compile(r'^\w+(?:\:\w+)+$'),
    # Zero, one or two characters from U+3041..U+3097 (also matches the
    # empty title).
    re.compile(r'^[\u3041-\u3097]{,2}$'),
)
# Single characters U+3041..U+3096 (86 code points); a title equal to one of
# them is filtered out by is_useless_entity.
HIRAGANA = {chr(code_point) for code_point in range(0x3041, 0x3041 + 86)}
# Single characters U+30A1..U+30FA (90 code points); filtered the same way.
KATAKANA = {chr(code_point) for code_point in range(0x30A1, 0x30A1 + 90)}
# Titles that must never be written. main() also adds every title it emits
# to this set, so it doubles as the "already written" record.
IGNORES = {'さんが'}
# Python 2 only: the standard streams accept bytes there, so wrap them in a
# UTF-8 encoding writer to let unicode titles be written directly.
# On Python 3 the streams are already text streams; wrapping them in a
# byte-encoding StreamWriter would make every write() fail, so leave them be.
if sys.version_info[0] < 3:
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
    sys.stderr = codecs.getwriter('utf8')(sys.stderr)
def extract_title_from_sql(path):
    """Return every title captured from a gzip-compressed SQL dump.

    The whole file at *path* is decompressed, decoded as UTF-8 and scanned
    with ``re_parentheses_title``; the result is the list of captured title
    fields in the order they appear in the dump.
    """
    with gzip.GzipFile(path) as dump:
        sql_text = dump.read().decode('utf8')
    # findall already returns a new list of the captured group.
    return re_parentheses_title.findall(sql_text)
def normalize(title):
    """Return *title* in the canonical form used for filtering and output.

    Steps, in order: apply Unicode NFKC normalization, remove a trailing
    ``_(...)`` qualifier (``re_title_brackets``), then turn the remaining
    underscores into spaces.
    """
    folded = unicodedata.normalize('NFKC', title)
    # The qualifier is stripped before the underscore replacement because the
    # pattern relies on the leading "_" still being present.
    without_qualifier = re_title_brackets.sub('', folded)
    return without_qualifier.replace('_', ' ')
def is_useless_entity(title):
    """Return True when *title* should be excluded from the output.

    A title is useless when any of the following holds:

    * it starts with an entry of ``IGNORE_PREFIX``;
    * it ends with an entry of ``IGNORE_SUFFIX``;
    * it is a single character contained in ``HIRAGANA`` or ``KATAKANA``;
    * it contains an entry of ``IGNORE_SUBSTR``;
    * it matches one of the ``IGNORE_PATTERNS`` from its first character.

    Returns False for every other title, so callers can keep a title with
    ``if not is_useless_entity(title)``.
    """
    return (
        title.startswith(IGNORE_PREFIX) or
        title.endswith(IGNORE_SUFFIX) or
        title in HIRAGANA or
        title in KATAKANA or
        any(fragment in title for fragment in IGNORE_SUBSTR) or
        any(pattern.match(title) for pattern in IGNORE_PATTERNS)
    )
def main(page_sql_dump, out):
    """Write the unique, useful titles of a page SQL dump to *out*.

    Titles are read from the gzip dump at *page_sql_dump*, normalized, and
    written one per line. A title is skipped when ``is_useless_entity``
    rejects it or when it is already in ``IGNORES``; each written title is
    added to ``IGNORES`` so it is emitted only once.

    *out* is any object with a ``write`` method accepting unicode text.
    """
    for raw_title in tqdm(extract_title_from_sql(page_sql_dump)):
        title = normalize(raw_title)
        if is_useless_entity(title) or title in IGNORES:
            continue
        # Record the title before writing so later duplicates are dropped.
        IGNORES.add(title)
        out.write(u'{:s}\n'.format(title))
# Command-line entry point: parse the dump path and optional output path,
# then run main(). Without --out the titles go to sys.stdout.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--page_sql_dump', '-p', type=str, default=None, required=True,
        help='Wikipedia page sql dump file (gz)')
    parser.add_argument(
        '--out', '-o', type=str, default=None,
        help='Output file.'
    )
    # Parse once; the result is reused for both options.
    args = parser.parse_args()

    if args.out:
        # The context manager closes (and thereby flushes) the output file
        # even if main() raises.
        with codecs.open(args.out, mode='w', encoding='utf8') as out:
            main(args.page_sql_dump, out)
    else:
        main(args.page_sql_dump, sys.stdout)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment