nutszebra/100_questions_NLP_023

## 100_questions_NLP_023
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#Link: http://www.cl.ecei.tohoku.ac.jp/nlp100/

"""
Wikipediaの記事を以下のフォーマットで書き出したファイルjawiki-country.json.gzがある．
link: http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz
1行に1記事の情報がJSON形式で格納される
各行には記事名が"title"キーに，記事本文が"text"キーの辞書オブジェクトに格納され，そのオブジェクトがJSON形式で書き出される
ファイル全体はgzipで圧縮される
以下の処理を行うプログラムを作成せよ．
Question 23:
23. セクション構造
記事中に含まれるセクション名とそのレベル（例えば"== セクション名 =="なら1）を表示せよ．
"""

import subprocess
import requests
import os
import re
import json

def download(url, dir, params=None):
  """Fetch `url` with an HTTP GET and save the response body to a file.

  Args:
    url: Address to request.
    dir: Path of the output file (a file path, despite the name). The file
      is opened in binary mode and overwritten if it already exists.
    params: Optional query parameters forwarded to `requests.get`.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status, so
      that an error page is never stored in place of the requested data.
  """
  # A `None` default avoids sharing one mutable dict between calls.
  dl = requests.get(url, params=params)
  dl.raise_for_status()
  # `open` replaces the Python 2-only `file` builtin; it exists on 2 and 3.
  with open(dir, "wb") as f:
    f.write(dl.content)

def gunzip(path):
  """Decompress the gzip file at `path` by running the external `gunzip` tool.

  Args:
    path: Path of the `.gz` file handed to `gunzip`.

  Raises:
    OSError: If the `gunzip` executable cannot be started.
  """
  # Pass an argument list and skip the shell: the path reaches gunzip
  # verbatim, so spaces or shell metacharacters in it are never interpreted.
  subprocess.call(["gunzip", path])

# Download and unpack
# http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz
# unless the decompressed corpus is already present.
if not os.path.exists("wikipedia.json"):
  download(
    "http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz",
    "./wikipedia.json.gz",
  )
  gunzip("./wikipedia.json.gz")

# The corpus holds one JSON object per line; each article is keyed by its
# zero-based line number. Bytes are read and decoded explicitly so parsing
# does not depend on the locale's default encoding.
content = {}
with open('wikipedia.json', 'rb') as f:
  for i, line in enumerate(f):
    content[i] = json.loads(line.decode("utf-8"))

keyword = [u"イギリス", u"英国", u"British"]

# Keep every article whose text mentions at least one of the keywords.
result = {}
for article in content:
  article_text = content[article]["text"]
  if any(word in article_text for word in keyword):
    result[article] = content[article]
"""
************************************************************************************
The code up to this point was written for question 20 and is reused here.
Link: https://gist.github.com/nutszebra/69ae8e21d576c03581f1
************************************************************************************
"""

# Example: the line "== 国号 ==" is a level-1 section named "国号".

# A heading occupies a whole line: a run of two or more "=", the title, and
# the same run of "=" again. Anchoring to the line keeps "==" inside body
# text from being picked up, and the back-reference gives the level directly
# instead of counting every "=" on the line.
_SECTION_RE = re.compile(r"^(={2,})(.+?)\1[ \t]*$", re.MULTILINE)


def _extract_sections(text):
  """Return [(section name, level), ...] for each heading in `text`, in order.

  The level is the number of "=" on one side minus one, so "== x ==" is
  level 1. Only surrounding whitespace is stripped from the name; spaces
  and "=" inside the title are preserved.
  """
  return [(title.strip(), len(marks) - 1)
          for marks, title in _SECTION_RE.findall(text)]


# Map each selected article number to its list of (section name, level).
answer = {}
for key in result:
  answer[key] = _extract_sections(result[key]["text"])

# Sorted keys give a stable article order on every Python version.
for key in sorted(answer):
  print("***************************************")
  print("article number: " + str(key))
  for s, l in answer[key]:
    # Plain concatenation keeps unicode section names printable on Python 2.
    print("section: " + s + ", level: " + str(l))
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	#Link: http://www.cl.ecei.tohoku.ac.jp/nlp100/

	"""
	Wikipediaの記事を以下のフォーマットで書き出したファイルjawiki-country.json.gzがある．
	link: http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz
	1行に1記事の情報がJSON形式で格納される
	各行には記事名が"title"キーに，記事本文が"text"キーの辞書オブジェクトに格納され，そのオブジェクトがJSON形式で書き出される
	ファイル全体はgzipで圧縮される
	以下の処理を行うプログラムを作成せよ．
	Question 23:
	23. セクション構造
	記事中に含まれるセクション名とそのレベル（例えば"== セクション名 =="なら1）を表示せよ．
	"""

	import subprocess
	import requests
	import os
	import re
	import json

	def download(url, dir, params=None):
	  """Fetch `url` with an HTTP GET and save the response body to a file.

	  Args:
	    url: Address to request.
	    dir: Path of the output file (a file path, despite the name). The file
	      is opened in binary mode and overwritten if it already exists.
	    params: Optional query parameters forwarded to `requests.get`.

	  Raises:
	    requests.HTTPError: If the server answers with a 4xx/5xx status, so
	      that an error page is never stored in place of the requested data.
	  """
	  # A `None` default avoids sharing one mutable dict between calls.
	  dl = requests.get(url, params=params)
	  dl.raise_for_status()
	  # `open` replaces the Python 2-only `file` builtin; it exists on 2 and 3.
	  with open(dir, "wb") as f:
	    f.write(dl.content)

	def gunzip(path):
	  """Decompress the gzip file at `path` by running the external `gunzip` tool.

	  Args:
	    path: Path of the `.gz` file handed to `gunzip`.

	  Raises:
	    OSError: If the `gunzip` executable cannot be started.
	  """
	  # Pass an argument list and skip the shell: the path reaches gunzip
	  # verbatim, so spaces or shell metacharacters in it are never interpreted.
	  subprocess.call(["gunzip", path])

	# Download and unpack
	# http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz
	# unless the decompressed corpus is already present.
	if not os.path.exists("wikipedia.json"):
	  download(
	    "http://www.cl.ecei.tohoku.ac.jp/nlp100/data/jawiki-country.json.gz",
	    "./wikipedia.json.gz",
	  )
	  gunzip("./wikipedia.json.gz")

	# The corpus holds one JSON object per line; each article is keyed by its
	# zero-based line number. Bytes are read and decoded explicitly so parsing
	# does not depend on the locale's default encoding.
	content = {}
	with open('wikipedia.json', 'rb') as f:
	  for i, line in enumerate(f):
	    content[i] = json.loads(line.decode("utf-8"))

	keyword = [u"イギリス", u"英国", u"British"]

	# Keep every article whose text mentions at least one of the keywords.
	result = {}
	for article in content:
	  article_text = content[article]["text"]
	  if any(word in article_text for word in keyword):
	    result[article] = content[article]
	"""
	************************************************************************************
	The code up to this point was written for question 20 and is reused here.
	Link: https://gist.github.com/nutszebra/69ae8e21d576c03581f1
	************************************************************************************
	"""

	# Example: the line "== 国号 ==" is a level-1 section named "国号".

	# A heading occupies a whole line: a run of two or more "=", the title, and
	# the same run of "=" again. Anchoring to the line keeps "==" inside body
	# text from being picked up, and the back-reference gives the level directly
	# instead of counting every "=" on the line.
	_SECTION_RE = re.compile(r"^(={2,})(.+?)\1[ \t]*$", re.MULTILINE)


	def _extract_sections(text):
	  """Return [(section name, level), ...] for each heading in `text`, in order.

	  The level is the number of "=" on one side minus one, so "== x ==" is
	  level 1. Only surrounding whitespace is stripped from the name; spaces
	  and "=" inside the title are preserved.
	  """
	  return [(title.strip(), len(marks) - 1)
	          for marks, title in _SECTION_RE.findall(text)]


	# Map each selected article number to its list of (section name, level).
	answer = {}
	for key in result:
	  answer[key] = _extract_sections(result[key]["text"])

	# Sorted keys give a stable article order on every Python version.
	for key in sorted(answer):
	  print("***************************************")
	  print("article number: " + str(key))
	  for s, l in answer[key]:
	    # Plain concatenation keeps unicode section names printable on Python 2.
	    print("section: " + s + ", level: " + str(l))