nutszebra/100_questions_NLP_021

## 100_questions_NLP_021
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#Link: http://www.cl.ecei.tohoku.ac.jp/nlp100/

"""
Wikipediaの記事を以下のフォーマットで書き出したファイルjawiki-country.json.gzがある．
link: http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz
1行に1記事の情報がJSON形式で格納される
各行には記事名が"title"キーに，記事本文が"text"キーの辞書オブジェクトに格納され，そのオブジェクトがJSON形式で書き出される
ファイル全体はgzipで圧縮される
以下の処理を行うプログラムを作成せよ．
Question 21:
21. カテゴリ名を含む行を抽出
記事中でカテゴリ名を宣言している行を抽出せよ．
"""

import io
import json
import os
import re
import subprocess

import requests

def download(url, dir, params=None):
  """Fetch ``url`` over HTTP and save the response body to a local file.

  Args:
    url: Address requested with ``requests.get``.
    dir: Destination file path (a file path, despite the name).  It is
      opened in binary write mode, so an existing file is overwritten.
      The name shadows the ``dir`` builtin but is kept so that callers
      passing it by keyword keep working.
    params: Optional mapping of query-string parameters handed through to
      ``requests.get``.  ``None`` (the default) sends no extra parameters;
      a ``None`` default avoids sharing one mutable dict between calls.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
  """
  dl = requests.get(url, params=params)
  # Fail loudly instead of saving an error page under the target name.
  dl.raise_for_status()
  # ``open`` works on Python 2 and 3; the ``file`` builtin is Python 2 only.
  with open(dir, "wb") as f:
    f.write(dl.content)

def gunzip(path):
  """Decompress the gzip file at ``path`` by running the ``gunzip`` program.

  Args:
    path: Path of the compressed file handed to ``gunzip``.

  The exit status of ``gunzip`` is not inspected and nothing is returned.
  """
  # Pass an argument list and no shell: the path reaches gunzip verbatim, so
  # spaces or shell metacharacters in it cannot split or inject commands.
  # "--" keeps a path that starts with "-" from being read as an option.
  subprocess.call(["gunzip", "--", path])

# Download http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz
# and unpack it, unless the decompressed JSON file is already on disk.
DATA_URL = "http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz"
JSON_PATH = "./wikipedia.json"
ARCHIVE_PATH = JSON_PATH + ".gz"

if not os.path.exists(JSON_PATH):
  download(DATA_URL, ARCHIVE_PATH)
  gunzip(ARCHIVE_PATH)

# The file holds one JSON object per line.  `content` maps a 0-based running
# index to the parsed article.  The file is decoded explicitly as UTF-8
# (io.open accepts `encoding` on Python 2 and 3) so the result does not depend
# on the locale, and blank lines are skipped because json.loads rejects them.
with io.open(JSON_PATH, "r", encoding="utf-8") as f:
  articles = (json.loads(line) for line in f if line.strip())
  content = dict(enumerate(articles))

# Search terms; the first two are Japanese for "United Kingdom" / "Britain".
keyword = [u"イギリス", u"英国", u"British"]

# `result` keeps every article whose "text" contains at least one keyword,
# under the same index it has in `content`.  The original condition tested
# the same expression on both sides of `or`; a single any() is equivalent.
result = {
  index: article
  for index, article in content.items()
  if any(word in article["text"] for word in keyword)
}
"""
************************************************************************************
The code up to this point was written for question 20 and is reused here.
Link: https://gist.github.com/nutszebra/69ae8e21d576c03581f1
************************************************************************************
"""

# Example of a category declaration line: [[Category:マレーシア|*]]
# Compiled once up front; matches a line that ends with a [[Category:...]] link.
category_pattern = re.compile(r"\[\[Category:.*\]\]$")

# `answer` maps an article index to its category lines joined by newlines.
# Articles without any category line get no entry.
answer = {}
for key in result:
  extract = [
    line
    for line in result[key]["text"].split("\n")
    if category_pattern.search(line)
  ]
  if extract:
    answer[key] = "\n".join(extract)
	#!/usr/bin/env python
	# -- coding: utf-8 --

	#Link: http://www.cl.ecei.tohoku.ac.jp/nlp100/

	"""
	Wikipediaの記事を以下のフォーマットで書き出したファイルjawiki-country.json.gzがある．
	link: http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz
	1行に1記事の情報がJSON形式で格納される
	各行には記事名が"title"キーに，記事本文が"text"キーの辞書オブジェクトに格納され，そのオブジェクトがJSON形式で書き出される
	ファイル全体はgzipで圧縮される
	以下の処理を行うプログラムを作成せよ．
	Question 21:
	21. カテゴリ名を含む行を抽出
	記事中でカテゴリ名を宣言している行を抽出せよ．
	"""

	import subprocess
	import requests
	import os
	import re
	import json

	def download(url, dir, params=None):
		"""Fetch ``url`` over HTTP and save the response body to a local file.

		Args:
			url: Address requested with ``requests.get``.
			dir: Destination file path (a file path, despite the name).  It is
				opened in binary write mode, so an existing file is overwritten.
				The name shadows the ``dir`` builtin but is kept so that callers
				passing it by keyword keep working.
			params: Optional mapping of query-string parameters handed through
				to ``requests.get``.  ``None`` (the default) sends no extra
				parameters; a ``None`` default avoids sharing one mutable dict
				between calls.

		Raises:
			requests.HTTPError: If the server answers with a 4xx/5xx status.
		"""
		dl = requests.get(url, params=params)
		# Fail loudly instead of saving an error page under the target name.
		dl.raise_for_status()
		# ``open`` works on Python 2 and 3; the ``file`` builtin is Python 2 only.
		with open(dir, "wb") as f:
			f.write(dl.content)

	def gunzip(path):
		"""Decompress the gzip file at ``path`` by running the ``gunzip`` program.

		Args:
			path: Path of the compressed file handed to ``gunzip``.

		The exit status of ``gunzip`` is not inspected and nothing is returned.
		"""
		# Pass an argument list and no shell: the path reaches gunzip verbatim,
		# so spaces or shell metacharacters in it cannot split or inject
		# commands.  "--" keeps a path starting with "-" from being read as an
		# option.
		subprocess.call(["gunzip", "--", path])

	# Download http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz
	# and unpack it, unless the decompressed JSON file is already on disk.
	DATA_URL = "http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz"
	JSON_PATH = "./wikipedia.json"
	ARCHIVE_PATH = JSON_PATH + ".gz"

	if not os.path.exists(JSON_PATH):
		download(DATA_URL, ARCHIVE_PATH)
		gunzip(ARCHIVE_PATH)

	# The file holds one JSON object per line.  `content` maps a 0-based running
	# index to the parsed article.  The file is decoded explicitly as UTF-8
	# (io.open accepts `encoding` on Python 2 and 3) so the result does not
	# depend on the locale, and blank lines are skipped because json.loads
	# rejects them.
	with io.open(JSON_PATH, "r", encoding="utf-8") as f:
		articles = (json.loads(line) for line in f if line.strip())
		content = dict(enumerate(articles))

	# Search terms; the first two are Japanese for "United Kingdom" / "Britain".
	keyword = [u"イギリス", u"英国", u"British"]

	# `result` keeps every article whose "text" contains at least one keyword,
	# under the same index it has in `content`.  The original condition tested
	# the same expression on both sides of `or`; a single any() is equivalent.
	result = {
		index: article
		for index, article in content.items()
		if any(word in article["text"] for word in keyword)
	}
	"""
	************************************************************************************
	The code up to this point was written for question 20 and is reused here.
	Link: https://gist.github.com/nutszebra/69ae8e21d576c03581f1
	************************************************************************************
	"""

	# Example of a category declaration line: [[Category:マレーシア|*]]
	# Compiled once up front; matches a line that ends with a [[Category:...]]
	# link.
	category_pattern = re.compile(r"\[\[Category:.*\]\]$")

	# `answer` maps an article index to its category lines joined by newlines.
	# Articles without any category line get no entry.
	answer = {}
	for key in result:
		extract = [
			line
			for line in result[key]["text"].split("\n")
			if category_pattern.search(line)
		]
		if extract:
			answer[key] = "\n".join(extract)