hiropppe/jawiki_extract_titles.py

## jawiki_extract_titles.py
#!/usr/bin/env python
# -*- coding:utf8 -*-
from __future__ import unicode_literals

import codecs
import gzip
import re
import sys
import unicodedata

from tqdm import tqdm

# Python 2 spells the code-point-to-character builtin ``unichr``. On Python 3
# that name does not exist and the builtin ``chr`` already does the job, so
# the alias is only installed when ``unichr`` is available.
try:
    chr = unichr
except NameError:
    pass

# Captures the third comma-separated field (optionally single-quoted) of every
# parenthesised "(digits,digits,field,...)" tuple found in the dump text.
re_parentheses_title = re.compile(r"\(\d+,\d+,'?([^,']+)'?,[^\)]+\)")

# Matches a trailing "_(...)" suffix at the very end of a title.
re_title_brackets = re.compile(r'_\([^\)]+\)$')


# Titles starting with any of these strings are filtered.
IGNORE_PREFIX = ('アップロードログ', '削除記録/', '削除依頼/', '検証/', '進行中の荒らし行為/', '井戸端/', 'WP:',
                 '利用者:', 'User:', 'ウィキプロジェクト', 'PJ:', 'メインページ/', '索引 ', '著作権/', 'Location map/')
# Titles ending with any of these strings are filtered.
IGNORE_SUFFIX = ('.jpg', '.jpeg', '.png', '.gif', '.js', '.css', 'の一覧', '/削除')
# Titles containing any of these strings anywhere are filtered.
IGNORE_SUBSTR = ('Wikipedia:', 'Template:', 'Listes:', '過去ログ:', 'ファイル:', '画像:',
                 'Section:', '/過去ログ', '/history', '/log', '/sandbox')
# Titles matching any of these anchored patterns are filtered.
IGNORE_PATTERNS = (re.compile(r'^\d+$'),                       # digits only
                   re.compile(r'^\d+年(?:\d+月)?(?:\d+日)?$'),  # year, optional month/day
                   re.compile(r'^\d+月(?:\d+日)?$'),            # month, optional day
                   re.compile(r'^\d+日$'),                      # day only
                   # Four dot-separated digit groups. Every dot is escaped so
                   # that strings such as "1.2a3b4" no longer match.
                   re.compile(r'^\d+\.\d+\.\d+\.\d+$'),
                   re.compile(r'^\w+(?:\:\w+)+$'),              # word:word[:word...]
                   # Zero to two characters from U+3041..U+3097.
                   re.compile(r'^[\u3041-\u3097]{,2}$'))

# 86 consecutive code points starting at U+3041 (12353).
HIRAGANA = set(map(chr, range(12353, 12353+86)))
# 90 consecutive code points starting at U+30A1 (12449).
KATAKANA = set(map(chr, range(12449, 12449+90)))

# Titles that must never be written. main() also adds every title it writes
# to this set, which is how duplicates are suppressed.
IGNORES = {'さんが'}

# On Python 2 the standard streams are byte streams, so wrap them to encode
# unicode text as UTF-8. Python 3 text streams already accept str, and wrapping
# them with a byte-producing writer would make every write fail.
if sys.version_info[0] < 3:
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
    sys.stderr = codecs.getwriter('utf8')(sys.stderr)


def extract_title_from_sql(path):
    """Return every title captured from the gzip-compressed file at ``path``.

    The whole file is decompressed into memory, decoded as UTF-8 and scanned
    with ``re_parentheses_title``; the captured group of each match is
    returned, in order of appearance, as a list of strings.
    """
    with gzip.GzipFile(path) as dump:
        sql_text = dump.read().decode('utf8')
    return re_parentheses_title.findall(sql_text)


def normalize(title):
    """Return ``title`` in the canonical form used for filtering and output.

    Steps, in order: Unicode NFKC normalisation, removal of a trailing
    "_(...)" suffix (``re_title_brackets``), then every underscore is
    replaced by a space.
    """
    nfkc_title = unicodedata.normalize('NFKC', title)
    # The suffix pattern starts with an underscore, so it has to be stripped
    # before the underscores are turned into spaces.
    without_suffix = re_title_brackets.sub('', nfkc_title)
    return without_suffix.replace('_', ' ')


def is_useless_entity(title):
    """Return True when ``title`` should be excluded from the output.

    A title is useless when at least one of the following holds:

    * it starts with an entry of ``IGNORE_PREFIX``;
    * it ends with an entry of ``IGNORE_SUFFIX``;
    * it is a member of ``HIRAGANA`` or ``KATAKANA``;
    * it contains an entry of ``IGNORE_SUBSTR``;
    * it matches (from the start) one of ``IGNORE_PATTERNS``.

    Otherwise False is returned.
    """
    # The previous implementation returned False for matching titles, which
    # inverted the meaning implied by the function name and made
    # ``if not is_useless_entity(title)`` in main() keep only the junk titles.
    return bool(
        title.startswith(IGNORE_PREFIX) or
        title.endswith(IGNORE_SUFFIX) or
        title in HIRAGANA or
        title in KATAKANA or
        any(s in title for s in IGNORE_SUBSTR) or
        any(r.match(title) for r in IGNORE_PATTERNS)
    )


def main(page_sql_dump, out):
    """Write the accepted titles of ``page_sql_dump`` to ``out``, one per line.

    Each title returned by ``extract_title_from_sql`` is passed through
    ``normalize``. A normalised title is skipped when ``is_useless_entity``
    returns a true value for it or when it is already in ``IGNORES``;
    otherwise it is added to ``IGNORES`` (so later duplicates are skipped)
    and written to ``out`` followed by a newline.

    :param page_sql_dump: path handed to ``extract_title_from_sql``.
    :param out: object with a ``write`` method accepting text.
    """
    for raw_title in tqdm(extract_title_from_sql(page_sql_dump)):
        cleaned = normalize(raw_title)
        if is_useless_entity(cleaned) or cleaned in IGNORES:
            continue
        # IGNORES doubles as the "already written" set.
        IGNORES.add(cleaned)
        out.write(u'{:s}\n'.format(cleaned))


if __name__ == '__main__':
    # Command-line entry point: parse the arguments, pick the output stream
    # and run main().
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--page_sql_dump', '-p', type=str, default=None, required=True,
        help='Wikipedia page sql dump file (gz)')
    parser.add_argument(
        '--out', '-o', type=str, default=None,
        help='Output file.'
    )

    # Parsed once; the second, redundant parse_args() call was removed.
    args = parser.parse_args()

    if args.out:
        out = codecs.open(args.out, mode='w', encoding='utf8')
        try:
            main(args.page_sql_dump, out)
        finally:
            # Close the file explicitly so buffered output is flushed even
            # when main() raises.
            out.close()
    else:
        # No --out given: write to standard output, which is left open.
        out = sys.stdout
        main(args.page_sql_dump, out)
	#!/usr/bin/env python
	# -- coding:utf8 --
	from __future__ import unicode_literals

	import codecs
	import gzip
	import re
	import sys
	import unicodedata

	from tqdm import tqdm

	# Python 2 spells the code-point-to-character builtin ``unichr``. On Python 3
	# that name does not exist and the builtin ``chr`` already does the job, so
	# the alias is only installed when ``unichr`` is available.
	try:
	    chr = unichr
	except NameError:
	    pass

	# Captures the third comma-separated field (optionally single-quoted) of every
	# parenthesised "(digits,digits,field,...)" tuple found in the dump text.
	re_parentheses_title = re.compile(r"\(\d+,\d+,'?([^,']+)'?,[^\)]+\)")

	# Matches a trailing "_(...)" suffix at the very end of a title.
	re_title_brackets = re.compile(r'_\([^\)]+\)$')


	# Titles starting with any of these strings are filtered.
	IGNORE_PREFIX = ('アップロードログ', '削除記録/', '削除依頼/', '検証/', '進行中の荒らし行為/', '井戸端/', 'WP:',
	                 '利用者:', 'User:', 'ウィキプロジェクト', 'PJ:', 'メインページ/', '索引 ', '著作権/', 'Location map/')
	# Titles ending with any of these strings are filtered.
	IGNORE_SUFFIX = ('.jpg', '.jpeg', '.png', '.gif', '.js', '.css', 'の一覧', '/削除')
	# Titles containing any of these strings anywhere are filtered.
	IGNORE_SUBSTR = ('Wikipedia:', 'Template:', 'Listes:', '過去ログ:', 'ファイル:', '画像:',
	                 'Section:', '/過去ログ', '/history', '/log', '/sandbox')
	# Titles matching any of these anchored patterns are filtered.
	IGNORE_PATTERNS = (re.compile(r'^\d+$'),                       # digits only
	                   re.compile(r'^\d+年(?:\d+月)?(?:\d+日)?$'),  # year, optional month/day
	                   re.compile(r'^\d+月(?:\d+日)?$'),            # month, optional day
	                   re.compile(r'^\d+日$'),                      # day only
	                   # Four dot-separated digit groups. Every dot is escaped so
	                   # that strings such as "1.2a3b4" no longer match.
	                   re.compile(r'^\d+\.\d+\.\d+\.\d+$'),
	                   re.compile(r'^\w+(?:\:\w+)+$'),              # word:word[:word...]
	                   # Zero to two characters from U+3041..U+3097.
	                   re.compile(r'^[\u3041-\u3097]{,2}$'))

	# 86 consecutive code points starting at U+3041 (12353).
	HIRAGANA = set(map(chr, range(12353, 12353+86)))
	# 90 consecutive code points starting at U+30A1 (12449).
	KATAKANA = set(map(chr, range(12449, 12449+90)))

	# Titles that must never be written. main() also adds every title it writes
	# to this set, which is how duplicates are suppressed.
	IGNORES = {'さんが'}

	# On Python 2 the standard streams are byte streams, so wrap them to encode
	# unicode text as UTF-8. Python 3 text streams already accept str, and wrapping
	# them with a byte-producing writer would make every write fail.
	if sys.version_info[0] < 3:
	    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
	    sys.stderr = codecs.getwriter('utf8')(sys.stderr)


	def extract_title_from_sql(path):
	    """Return every title captured from the gzip-compressed file at ``path``.

	    The whole file is decompressed into memory, decoded as UTF-8 and scanned
	    with ``re_parentheses_title``; the captured group of each match is
	    returned, in order of appearance, as a list of strings.
	    """
	    # Body indentation restored; it had been flattened in this copy.
	    with gzip.GzipFile(path) as fd:
	        return list(re_parentheses_title.findall(fd.read().decode('utf8')))


	def normalize(title):
	    """Return ``title`` in the canonical form used for filtering and output.

	    Steps, in order: Unicode NFKC normalisation, removal of a trailing
	    "_(...)" suffix (``re_title_brackets``), then every underscore is
	    replaced by a space.
	    """
	    # Body indentation restored; it had been flattened in this copy.
	    title = unicodedata.normalize('NFKC', title)
	    # The suffix pattern starts with an underscore, so it has to be stripped
	    # before the underscores are turned into spaces.
	    title = re_title_brackets.sub('', title)
	    title = title.replace('_', ' ')
	    return title


	def is_useless_entity(title):
	    """Return True when ``title`` should be excluded from the output.

	    A title is useless when it starts with an entry of ``IGNORE_PREFIX``,
	    ends with an entry of ``IGNORE_SUFFIX``, is a member of ``HIRAGANA`` or
	    ``KATAKANA``, contains an entry of ``IGNORE_SUBSTR``, or matches (from
	    the start) one of ``IGNORE_PATTERNS``. Otherwise False is returned.
	    """
	    # Body indentation restored. The previous logic returned False for
	    # matching titles, inverting the meaning implied by the function name
	    # and relied on by ``if not is_useless_entity(title)`` in main().
	    return bool(
	        title.startswith(IGNORE_PREFIX) or
	        title.endswith(IGNORE_SUFFIX) or
	        title in HIRAGANA or
	        title in KATAKANA or
	        any(s in title for s in IGNORE_SUBSTR) or
	        any(r.match(title) for r in IGNORE_PATTERNS)
	    )


	def main(page_sql_dump, out):
	    """Write the accepted titles of ``page_sql_dump`` to ``out``, one per line.

	    Each title returned by ``extract_title_from_sql`` is passed through
	    ``normalize``. A normalised title is written (followed by a newline)
	    only when ``is_useless_entity`` returns a false value for it and it is
	    not yet in ``IGNORES``; written titles are added to ``IGNORES`` so that
	    later duplicates are skipped.

	    :param page_sql_dump: path handed to ``extract_title_from_sql``.
	    :param out: object with a ``write`` method accepting text.
	    """
	    # Body indentation restored; it had been flattened in this copy.
	    titles = extract_title_from_sql(page_sql_dump)
	    for title in tqdm(titles):
	        title = normalize(title)
	        if not is_useless_entity(title):
	            if title not in IGNORES:
	                # IGNORES doubles as the "already written" set.
	                IGNORES.add(title)
	                out.write(u'{:s}\n'.format(title))


	if __name__ == '__main__':
	    # Command-line entry point: parse the arguments, pick the output stream
	    # and run main(). Block indentation restored; it had been flattened.
	    import argparse
	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        '--page_sql_dump', '-p', type=str, default=None, required=True,
	        help='Wikipedia page sql dump file (gz)')
	    parser.add_argument(
	        '--out', '-o', type=str, default=None,
	        help='Output file.'
	    )

	    # Parsed once; the second, redundant parse_args() call was removed.
	    args = parser.parse_args()

	    if args.out:
	        out = codecs.open(args.out, mode='w', encoding='utf8')
	        try:
	            main(args.page_sql_dump, out)
	        finally:
	            # Close the file explicitly so buffered output is flushed even
	            # when main() raises.
	            out.close()
	    else:
	        # No --out given: write to standard output, which is left open.
	        out = sys.stdout
	        main(args.page_sql_dump, out)