Created
June 3, 2017 05:29
Star
You must be signed in to star a gist
回答 - 言語処理100本ノック 2015 - 第3章
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
20. JSONデータの読み込み | |
Wikipedia記事のJSONファイルを読み込み,「イギリス」に関する記事本文を表示せよ.問題21-29では,ここで抽出した記事本文に対して実行せよ. | |
""" | |
import codecs | |
import json | |
# Print the body text of every article titled "イギリス".
# The context manager closes the data file once the scan is finished.
with codecs.open("./src/jawiki-country.json", "r", "utf-8") as articles:
    for row in articles:
        # Each line of the file is parsed as one JSON object.
        article = json.loads(row)
        if article["title"] == u"イギリス":
            print(article["text"])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
21. カテゴリ名を含む行を抽出 | |
記事中でカテゴリ名を宣言している行を抽出せよ. | |
""" | |
import codecs | |
import json | |
import re | |
def extract_text(title):
    """Return the body text of the article whose title equals ``title``.

    Reads ./src/jawiki-country.json, parsing each line as one JSON object
    with "title" and "text" keys.

    Args:
        title: Article title to look for (exact match).

    Returns:
        The "text" value of the first matching article, or None when no
        line has a matching title.
    """
    # The context manager closes the file even on the early return.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as articles:
        for row in articles:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None
# Print every line of the article that contains "Category:".
text = extract_text(u"イギリス")
category_lines = [row for row in text.split("\n") if re.search(r"Category:", row)]
for category_line in category_lines:
    print(category_line)
# => | |
# [[Category:イギリス|*]] | |
# [[Category:英連邦王国|*]] | |
# [[Category:G8加盟国]] | |
# [[Category:欧州連合加盟国]] | |
# [[Category:海洋国家]] | |
# [[Category:君主国]] | |
# [[Category:島国|くれいとふりてん]] | |
# [[Category:1801年に設立された州・地域]] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
22. カテゴリ名の抽出 | |
記事のカテゴリ名を(行単位ではなく名前で)抽出せよ. | |
""" | |
import codecs | |
import json | |
import re | |
def extract_text(title):
    """Return the body text of the article whose title equals ``title``.

    Reads ./src/jawiki-country.json, parsing each line as one JSON object
    with "title" and "text" keys.

    Args:
        title: Article title to look for (exact match).

    Returns:
        The "text" value of the first matching article, or None when no
        line has a matching title.
    """
    # The context manager closes the file even on the early return.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as articles:
        for row in articles:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None
# Print the category name found on each line that declares a category.
text = extract_text(u"イギリス")
category_pattern = re.compile(r"Category:(?P<category>.+?)(\||])")
for row in text.split("\n"):
    found = category_pattern.search(row)
    if found is None:
        continue
    # The name ends at the first "|" or "]" after "Category:".
    print(found.group("category"))
# => | |
# イギリス | |
# 英連邦王国 | |
# G8加盟国 | |
# 欧州連合加盟国 | |
# 海洋国家 | |
# 君主国 | |
# 島国 | |
# 1801年に設立された州・地域 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
23. セクション構造 | |
記事中に含まれるセクション名とそのレベル(例えば"== セクション名 =="なら1)を表示せよ. | |
""" | |
import codecs | |
import json | |
import re | |
def extract_text(title):
    """Return the body text of the article whose title equals ``title``.

    Reads ./src/jawiki-country.json, parsing each line as one JSON object
    with "title" and "text" keys.

    Args:
        title: Article title to look for (exact match).

    Returns:
        The "text" value of the first matching article, or None when no
        line has a matching title.
    """
    # The context manager closes the file even on the early return.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as articles:
        for row in articles:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None
# Print "<level>: <name>" for every section heading line of the article.
text = extract_text(u"イギリス")
heading_pattern = re.compile(r"^(?P<level>=+)(?P<header>.+)\1$")
for row in text.split("\n"):
    heading = heading_pattern.search(row)
    if heading is None:
        continue
    # The "level" group holds only "=" characters; "==" counts as level 1.
    depth = len(heading.group("level")) - 1
    print("{0}: {1}".format(depth, heading.group("header")))
# => | |
# 1: 国名 | |
# 1: 歴史 | |
# 1: 地理 | |
# 2: 気候 | |
# ... | |
# 1: 外部リンク |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
24. ファイル参照の抽出 | |
記事から参照されているメディアファイルをすべて抜き出せ. | |
""" | |
import codecs | |
import json | |
import re | |
def extract_text(title):
    """Return the body text of the article whose title equals ``title``.

    Reads ./src/jawiki-country.json, parsing each line as one JSON object
    with "title" and "text" keys.

    Args:
        title: Article title to look for (exact match).

    Returns:
        The "text" value of the first matching article, or None when no
        line has a matching title.
    """
    # The context manager closes the file even on the early return.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as articles:
        for row in articles:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None
# Print the media file name referenced on each line of the article.
text = extract_text(u"イギリス")
for line in text.split("\n"):
    # Raw string: "\|" is a regex escape, not a valid Python string escape.
    # Only the first "ファイル:...|" reference on a line is reported.
    m = re.search(r"ファイル:(?P<filename>[^|]+)\|", line)
    if m:
        print(m.group("filename"))
# => | |
# Royal Coat of Arms of the United Kingdom.svg | |
# CHANDOS3.jpg | |
# The Fabs.JPG | |
# PalaceOfWestminsterAtNight.jpg | |
# ... | |
# Wembley Stadium, illuminated.jpg |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
25. テンプレートの抽出 | |
記事中に含まれる「基礎情報」テンプレートのフィールド名と値を抽出し,辞書オブジェクトとして格納せよ. | |
""" | |
import codecs | |
import json | |
import re | |
from pprint import pprint | |
def extract_text(title):
    """Return the body text of the article whose title equals ``title``.

    Reads ./src/jawiki-country.json, parsing each line as one JSON object
    with "title" and "text" keys.

    Args:
        title: Article title to look for (exact match).

    Returns:
        The "text" value of the first matching article, or None when no
        line has a matching title.
    """
    # The context manager closes the file even on the early return.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as articles:
        for row in articles:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None
def extract_base_info(text):
    """Parse the 基礎情報 template found in ``text`` into a dict.

    Args:
        text: Article wiki markup.

    Returns:
        Dict mapping each field name to its raw value string; an empty
        dict when no 基礎情報 template is found.
    """
    # Raw string: "\|" is a regex escape, not a Python string escape.
    m = re.search(r"{{基礎情報[^|]+\|(?P<info_body>.+?)\n}}", text, re.DOTALL)
    if not m:
        return {}
    info_dict = {}
    # Fields are separated by a newline followed by "|".
    for item in m.group("info_body").split("\n|"):
        parts = re.split(r"\s*=\s*", item, maxsplit=1)
        if len(parts) != 2:
            # No "=" in this chunk, so there is no key/value pair to store.
            continue
        key, word = parts
        info_dict[key] = word
    return info_dict
# Parse the template fields of the "イギリス" article and pretty-print them.
text = extract_text(u"イギリス")
base_info = extract_base_info(text)
pprint(
    base_info,
    indent=4,
)
# => | |
# { | |
# '公式国名': '{{lang|en|United Kingdom of Great Britain and Northern Ireland}}<ref>英語以外での...コットランド語)</ref>', | |
# '国旗画像': 'Flag of the United Kingdom.svg', | |
# '日本語国名': 'グレートブリテン及び北アイルランド連合王国', | |
# '国章リンク': '([[イギリスの国章|国章]])', | |
# ... | |
# '首都': '[[ロンドン]]' | |
# } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
26. 強調マークアップの除去 | |
25の処理時に,テンプレートの値からMediaWikiの強調マークアップ(弱い強調,強調,強い強調のすべて)を除去してテキストに変換せよ(参考: マークアップ早見表). | |
""" | |
import codecs | |
import json | |
import re | |
from pprint import pprint | |
def extract_text(title):
    """Return the body text of the article whose title equals ``title``.

    Reads ./src/jawiki-country.json, parsing each line as one JSON object
    with "title" and "text" keys.

    Args:
        title: Article title to look for (exact match).

    Returns:
        The "text" value of the first matching article, or None when no
        line has a matching title.
    """
    # The context manager closes the file even on the early return.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as articles:
        for row in articles:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None
def extract_base_info(text):
    """Parse the 基礎情報 template found in ``text`` into a dict.

    Args:
        text: Article wiki markup.

    Returns:
        Dict mapping each field name to its raw value string; an empty
        dict when no 基礎情報 template is found.
    """
    # Raw string: "\|" is a regex escape, not a Python string escape.
    m = re.search(r"{{基礎情報[^|]+\|(?P<info_body>.+?)\n}}", text, re.DOTALL)
    if not m:
        return {}
    info_dict = {}
    # Fields are separated by a newline followed by "|".
    for item in m.group("info_body").split("\n|"):
        parts = re.split(r"\s*=\s*", item, maxsplit=1)
        if len(parts) != 2:
            # No "=" in this chunk, so there is no key/value pair to store.
            continue
        key, word = parts
        info_dict[key] = word
    return info_dict
def remove_emphasis(text):
    """Strip emphasis markup: delete every run of two or more apostrophes."""
    emphasis_quotes = re.compile(r"'{2,}")
    return emphasis_quotes.sub("", text)
# Parse the template fields, strip emphasis markup from every value, and
# pretty-print the cleaned dict.
text = extract_text(u"イギリス")
base_info = extract_base_info(text)
sanitized_base_info = {
    field: remove_emphasis(value) for field, value in base_info.items()
}
pprint(sanitized_base_info, indent=4)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
27. 内部リンクの除去 | |
26の処理に加えて,テンプレートの値からMediaWikiの内部リンクマークアップを除去し,テキストに変換せよ(参考: マークアップ早見表). | |
""" | |
import codecs | |
import json | |
import re | |
from pprint import pprint | |
def extract_text(title):
    """Return the body text of the article whose title equals ``title``.

    Reads ./src/jawiki-country.json, parsing each line as one JSON object
    with "title" and "text" keys.

    Args:
        title: Article title to look for (exact match).

    Returns:
        The "text" value of the first matching article, or None when no
        line has a matching title.
    """
    # The context manager closes the file even on the early return.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as articles:
        for row in articles:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None
def extract_base_info(text):
    """Parse the 基礎情報 template found in ``text`` into a dict.

    Args:
        text: Article wiki markup.

    Returns:
        Dict mapping each field name to its raw value string; an empty
        dict when no 基礎情報 template is found.
    """
    # Raw string: "\|" is a regex escape, not a Python string escape.
    m = re.search(r"{{基礎情報[^|]+\|(?P<info_body>.+?)\n}}", text, re.DOTALL)
    if not m:
        return {}
    info_dict = {}
    # Fields are separated by a newline followed by "|".
    for item in m.group("info_body").split("\n|"):
        # Whitespace around "=" is optional, so "key=value" is accepted too.
        parts = re.split(r"\s*=\s*", item, maxsplit=1)
        if len(parts) != 2:
            # No "=" in this chunk, so there is no key/value pair to store.
            continue
        key, word = parts
        info_dict[key] = word
    return info_dict
def remove_emphasis(text):
    """Strip emphasis markup: delete every run of two or more apostrophes."""
    apostrophe_run = r"'{2,}"
    stripped = re.sub(apostrophe_run, "", text)
    return stripped
def remove_internal_links(text):
    """Replace each [[...]] internal link with its last "|"-separated part."""

    def visible_label(match):
        # Links without "|" keep their whole content.
        return match.group(1).split("|")[-1]

    return re.sub(r"\[\[([^]]+)\]\]", visible_label, text)
# Parse the template fields, strip emphasis markup and then internal-link
# markup from every value, and pretty-print the cleaned dict.
text = extract_text(u"イギリス")
base_info = extract_base_info(text)
sanitized_base_info = {
    field: remove_internal_links(remove_emphasis(value))
    for field, value in base_info.items()
}
pprint(sanitized_base_info, indent=4)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
28. MediaWikiマークアップの除去 | |
27の処理に加えて,テンプレートの値からMediaWikiマークアップを可能な限り除去し,国の基本情報を整形せよ. | |
""" | |
import codecs | |
import json | |
import re | |
from pprint import pprint | |
def extract_text(title):
    """Return the body text of the article whose title equals ``title``.

    Reads ./src/jawiki-country.json, parsing each line as one JSON object
    with "title" and "text" keys.

    Args:
        title: Article title to look for (exact match).

    Returns:
        The "text" value of the first matching article, or None when no
        line has a matching title.
    """
    # The context manager closes the file even on the early return.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as articles:
        for row in articles:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None
def extract_base_info(text):
    """Parse the 基礎情報 template in ``text`` into a dict of cleaned values.

    Every field value is passed through the markup-removal helpers of this
    script before it is stored.

    Args:
        text: Article wiki markup.

    Returns:
        Dict mapping each field name to its cleaned value string; an empty
        dict when no 基礎情報 template is found.
    """
    # Raw string: "\|" is a regex escape, not a Python string escape.
    m = re.search(r"{{基礎情報[^|]+\|(?P<info_body>.+?)\n}}", text, re.DOTALL)
    if not m:
        return {}
    # Applied to every value in exactly this order.
    cleaners = (
        remove_section_header,
        remove_emphasis,
        remove_category_links,
        remove_internal_links,
        remove_external_links,
        remove_template,
        remove_unordered_list,
        remove_define_list,
        remove_redirect,
        remove_comment,
    )
    info_dict = {}
    # Fields are separated by a newline followed by "|".
    for item in m.group("info_body").split("\n|"):
        # Whitespace around "=" is optional, so "key=value" is accepted too.
        parts = re.split(r"\s*=\s*", item, maxsplit=1)
        if len(parts) != 2:
            # No "=" in this chunk, so there is no key/value pair to store.
            continue
        key, word = parts
        for clean in cleaners:
            word = clean(word)
        info_dict[key] = word
    return info_dict
def remove_section_header(text):
    """Strip heading markup: drop the matching "=" runs around a heading."""
    # "\2" re-inserts only the text captured between the two "=" runs.
    return re.sub(r"(=+)(.+?)\1", r"\2", text)
def remove_emphasis(text):
    """Strip emphasis markup: delete every run of two or more apostrophes."""
    emphasis_quotes = re.compile(r"'{2,}")
    return emphasis_quotes.sub("", text)
def remove_category_links(text):
    """Replace each [[Category:...]] link with its category name."""

    def category_name(match):
        # Anything from the first "|" onward is dropped.
        return match.group(1).partition("|")[0]

    return re.sub(r"\[\[Category:(.+?)\]\]", category_name, text)
def remove_internal_links(text):
    """Replace each [[...]] internal link with its last "|"-separated part."""

    def visible_label(match):
        # Links without "|" keep their whole content.
        return match.group(1).rsplit("|", 1)[-1]

    return re.sub(r"\[\[([^]]+)\]\]", visible_label, text)
def remove_external_links(text):
    """Replace each [url label] external link with its label.

    A bracketed span with no space in it (a bare "[url]") is replaced by
    its own content.

    Args:
        text: Wiki markup to clean.

    Returns:
        ``text`` with every single-bracket span replaced as described.
    """

    def link_label(match):
        # Split only at the first space so a multi-word label is kept whole
        # rather than being cut down to its last word.
        return match.group(1).split(" ", 1)[-1]

    return re.sub(r"\[([^]]+)\]", link_label, text)
def remove_template(text):
    """Replace each {{...}} template with its last "|"-separated part."""

    def last_argument(match):
        # A template without "|" is replaced by its whole content.
        return match.group(1).rsplit("|", 1)[-1]

    return re.sub(r"\{\{(.+?)\}\}", last_argument, text)
def remove_unordered_list(text):
    """Strip bullet-list markup: leading "*" runs and the whitespace after."""
    bullet_prefix = re.compile(r"^\*+\s*", re.MULTILINE)
    return bullet_prefix.sub("", text)
def remove_ordered_list(text):
    """Strip numbered-list markup: leading "#" runs and the whitespace after."""
    number_prefix = re.compile(r"^#+\s*", re.MULTILINE)
    return number_prefix.sub("", text)
def remove_define_list(text):
    """Strip definition-list markup: one leading ":" or ";" per line."""
    definition_prefix = re.compile(r"^(:|;)\s*", re.MULTILINE)
    return definition_prefix.sub("", text)
def remove_redirect(text):
    """Replace "#REDIRECT [[target]]" with the bare target."""
    # "\1" re-inserts the text captured between the double brackets.
    return re.sub(r"#REDIRECT \[\[(.+?)\]\]", r"\1", text)
def remove_comment(text):
    """Delete every <!-- ... --> comment from ``text``.

    Args:
        text: Wiki markup to clean.

    Returns:
        ``text`` with all comments removed, including comments that span
        several lines.
    """
    # DOTALL lets "." cross newlines so multi-line comments are matched;
    # the non-greedy ".*?" still stops at the first closing "-->".
    return re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
# Parse and clean the template fields of the "イギリス" article, then
# pretty-print them.
text = extract_text(u"イギリス")
base_info = extract_base_info(text)
pprint(
    base_info,
    indent=4,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
29. 国旗画像のURLを取得する | |
テンプレートの内容を利用し,国旗画像のURLを取得せよ.(ヒント: MediaWiki APIのimageinfoを呼び出して,ファイル参照をURLに変換すればよい) | |
""" | |
import codecs | |
import json | |
import re | |
from pprint import pprint | |
def extract_text(title):
    """Return the body text of the article whose title equals ``title``.

    Reads ./src/jawiki-country.json, parsing each line as one JSON object
    with "title" and "text" keys.

    Args:
        title: Article title to look for (exact match).

    Returns:
        The "text" value of the first matching article, or None when no
        line has a matching title.
    """
    # The context manager closes the file even on the early return.
    with codecs.open("./src/jawiki-country.json", "r", "utf-8") as articles:
        for row in articles:
            article = json.loads(row)
            if title == article["title"]:
                return article["text"]
    return None
def extract_base_info(text):
    """Parse the 基礎情報 template in ``text`` into a dict of cleaned values.

    Every field value is passed through the markup-removal helpers of this
    script before it is stored.

    Args:
        text: Article wiki markup.

    Returns:
        Dict mapping each field name to its cleaned value string; an empty
        dict when no 基礎情報 template is found.
    """
    # Raw string: "\|" is a regex escape, not a Python string escape.
    m = re.search(r"{{基礎情報[^|]+\|(?P<info_body>.+?)\n}}", text, re.DOTALL)
    if not m:
        return {}
    info_dict = {}
    # Fields are separated by a newline followed by "|".
    for item in m.group("info_body").split("\n|"):
        # Whitespace around "=" is optional, so "key=value" is accepted too.
        parts = re.split(r"\s*=\s*", item, maxsplit=1)
        if len(parts) != 2:
            # No "=" in this chunk, so there is no key/value pair to store.
            continue
        key, word = parts
        # The cleaning steps run in this fixed order.
        word = remove_section_header(word)
        word = remove_emphasis(word)
        word = remove_category_links(word)
        word = remove_internal_links(word)
        word = remove_external_links(word)
        word = remove_template(word)
        word = remove_unordered_list(word)
        word = remove_define_list(word)
        word = remove_redirect(word)
        word = remove_comment(word)
        info_dict[key] = word
    return info_dict
def remove_section_header(text):
    """Strip heading markup: drop the matching "=" runs around a heading."""

    def heading_text(match):
        # Group 2 is the text between the opening and closing "=" runs.
        return match.group(2)

    heading_pattern = re.compile(r"(=+)(.+?)\1")
    return heading_pattern.sub(heading_text, text)
def remove_emphasis(text):
    """Strip emphasis markup: delete every run of two or more apostrophes."""
    apostrophe_run = r"'{2,}"
    stripped = re.sub(apostrophe_run, "", text)
    return stripped
def remove_category_links(text):
    """Replace each [[Category:...]] link with its category name."""
    category_link = re.compile(r"\[\[Category:(.+?)\]\]")

    def category_name(match):
        # Keep only the part before the first "|".
        inner = match.group(1)
        return inner.partition("|")[0]

    return category_link.sub(category_name, text)
def remove_internal_links(text):
    """Replace each [[...]] internal link with its last "|"-separated part."""
    internal_link = re.compile(r"\[\[([^]]+)\]\]")

    def visible_label(match):
        # Links without "|" keep their whole content.
        pieces = match.group(1).split("|")
        return pieces[-1]

    return internal_link.sub(visible_label, text)
def remove_external_links(text):
    """Replace each [url label] external link with its label.

    A bracketed span with no space in it (a bare "[url]") is replaced by
    its own content.

    Args:
        text: Wiki markup to clean.

    Returns:
        ``text`` with every single-bracket span replaced as described.
    """

    def link_label(match):
        # Split only at the first space so a multi-word label is kept whole
        # rather than being cut down to its last word.
        return match.group(1).split(" ", 1)[-1]

    return re.sub(r"\[([^]]+)\]", link_label, text)
def remove_template(text):
    """Replace each {{...}} template with its last "|"-separated part."""
    template = re.compile(r"\{\{(.+?)\}\}")

    def last_argument(match):
        # A template without "|" is replaced by its whole content.
        arguments = match.group(1).split("|")
        return arguments[-1]

    return template.sub(last_argument, text)
def remove_unordered_list(text):
    """Strip bullet-list markup: leading "*" runs and the whitespace after."""
    pattern = r"^\*+\s*"
    # MULTILINE makes "^" match at the start of every line.
    cleaned = re.sub(pattern, "", text, flags=re.MULTILINE)
    return cleaned
def remove_ordered_list(text):
    """Strip numbered-list markup: leading "#" runs and the whitespace after."""
    pattern = r"^#+\s*"
    # MULTILINE makes "^" match at the start of every line.
    cleaned = re.sub(pattern, "", text, flags=re.MULTILINE)
    return cleaned
def remove_define_list(text):
    """Strip definition-list markup: one leading ":" or ";" per line."""
    pattern = r"^(:|;)\s*"
    # MULTILINE makes "^" match at the start of every line.
    cleaned = re.sub(pattern, "", text, flags=re.MULTILINE)
    return cleaned
def remove_redirect(text):
    """Replace "#REDIRECT [[target]]" with the bare target."""

    def redirect_target(match):
        # Group 1 is the text between the double brackets.
        return match.group(1)

    redirect = re.compile(r"#REDIRECT \[\[(.+?)\]\]")
    return redirect.sub(redirect_target, text)
def remove_comment(text):
    """Delete every <!-- ... --> comment from ``text``.

    Args:
        text: Wiki markup to clean.

    Returns:
        ``text`` with all comments removed, including comments that span
        several lines.
    """
    # DOTALL lets "." cross newlines so multi-line comments are matched;
    # the non-greedy ".*?" still stops at the first closing "-->".
    return re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
text = extract_text(u"イギリス")
base_info = extract_base_info(text)
from urllib.parse import urlencode
from urllib import request

# File name of the flag image, taken from the parsed template fields.
flag_image_name = base_info["国旗画像"]

# Query parameters asking the API for the imageinfo "url" property of the file.
api_params = {
    "action": "query",
    "titles": "File:{0}".format(flag_image_name),
    "prop": "imageinfo",
    "iiprop": "url",
    "format": "json",
}
query = urlencode(api_params)
url = "https://commons.wikimedia.org/w/api.php?{0}".format(query)

# Fetch the response and decode it as UTF-8 JSON.
with request.urlopen(url) as response:
    body = response.read()
    data = json.loads(body.decode("utf-8"))
pprint(data, indent=4)
# => | |
# { | |
# 'continue': {'continue': '||', 'iistart': '2007-09-03T09:51:34Z'}, | |
# 'query': { | |
# 'pages': { | |
# '347935': { | |
# 'imageinfo': [{ | |
# 'descriptionshorturl': 'https://commons.wikimedia.org/w/index.php?curid=347935', | |
# 'descriptionurl': 'https://commons.wikimedia.org/wiki/File:Flag_of_the_United_Kingdom.svg', | |
# 'url': 'https://upload.wikimedia.org/wikipedia/commons/a/ae/Flag_of_the_United_Kingdom.svg' | |
# }], | |
# 'imagerepository': 'local', | |
# 'ns': 6, | |
# 'pageid': 347935, | |
# 'title': 'File:Flag of the United Kingdom.svg' | |
# } | |
# } | |
# } | |
# } | |
# Take the first page entry of the response and read the URL from its
# first imageinfo record.
pages = data["query"]["pages"]
first_page = list(pages.values())[0]
flag_image_url = first_page["imageinfo"][0]["url"]
print(flag_image_url)
# => https://upload.wikimedia.org/wikipedia/commons/a/ae/Flag_of_the_United_Kingdom.svg |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment