nutszebra/100_questions_NLP_013

## 100_questions_NLP_013
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#Link: http://www.cl.ecei.tohoku.ac.jp/nlp100/

"""
hightemp.txt: http://www.cl.ecei.tohoku.ac.jp/nlp100/data/hightemp.txt
は，日本の最高気温の記録を「都道府県」「地点」「℃」「日」のタブ区切り形式で格納したファイルである．
以下の処理を行うプログラムを作成し，hightemp.txtを入力ファイルとして実行せよ．
Question 13:
13. col1.txtとcol2.txtをマージ
12で作ったcol1.txtとcol2.txtを結合し，元のファイルの1列目と2列目をタブ区切りで並べたテキストファイルを作成せよ．
確認にはpasteコマンドを用いよ．
"""

import subprocess
import requests
import os

def download(url, dir, params=None):
  """Fetch ``url`` with an HTTP GET and store the response body at ``dir``.

  Args:
    url: Address to request.
    dir: Destination file path (a file, despite the parameter name).
    params: Optional mapping of query-string parameters; ``None`` sends none.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
  """
  dl = requests.get(url, params=params)
  #fail loudly instead of saving an error page as if it were the data file
  dl.raise_for_status()
  #open() works on Python 2 and 3; the file() builtin exists on Python 2 only
  with open(dir, "wb") as f:
    f.write(dl.content)

def readFile(path):
  """Return the whole content of the text file at ``path`` as one string.

  Args:
    path: Path of the file to read.

  Returns:
    The file content, line endings included; "" for an empty file.
  """
  #open() works on Python 2 and 3; the file() builtin exists on Python 2 only
  with open(path, "r") as f:
    #read() returns the same text as concatenating every line by hand
    return f.read()

def remove(path):
  """Delete ``path`` with ``os.remove`` when it exists; otherwise do nothing."""
  if not os.path.exists(path):
    #nothing to delete
    return
  os.remove(path)

def convertPunctuationSymbol(punctuation):
  """Map a delimiter name, or the delimiter itself, to the delimiter string.

  Args:
    punctuation: One of "space", "tab", "comma", "period", "double space",
      or the corresponding literal delimiter.

  Returns:
    The delimiter string; a single space for anything unrecognised.
  """
  #(name, symbol) pairs, compared in this order
  knownSymbols = (
    ("space", " "),
    ("tab", "\t"),
    ("comma", ","),
    ("period", "."),
    ("double space", "  "),
  )
  for name, symbol in knownSymbols:
    if punctuation == name or punctuation == symbol:
      return symbol
  #fallback for unknown input
  return " "

def saveFile(array, path, punctuation = "space"):
  """Write ``array`` to ``path``, one element per line.

  Args:
    array: Either an iterable of strings (one column) or an iterable of
      sequences of strings (several columns). Iterators such as the result
      of zip() on Python 3 are accepted.
    path: Destination file path.
    punctuation: Delimiter name or symbol placed between columns; it is
      resolved through convertPunctuationSymbol.

  Returns:
    None. When ``array`` is empty nothing is written and no file is created.
  """
  punctuation = convertPunctuationSymbol(punctuation)
  #materialise once so len()/indexing also work for iterators
  rows = list(array)
  #array is empty
  if not rows:
    return None
  #the first element decides the layout for every line
  oneColumn = isinstance(rows[0], str)
  howManyElementInLine = 0 if oneColumn else len(rows[0])
  #text mode: the values written are str; open() works on Python 2 and 3
  with open(path, "w") as f:
    for row in rows:
      if oneColumn:
        f.write(row + "\n")
      else:
        #explicit indexing keeps the IndexError for rows shorter than the first
        f.write(punctuation.join(row[ii] for ii in range(howManyElementInLine)) + "\n")

def encloseByQuotation(sentence):
  """Wrap ``sentence`` in quotes.

  Single quotes are used unless ``sentence`` itself contains a single quote,
  in which case double quotes are used.
  """
  mark = '"' if "'" in sentence else "'"
  return "".join((mark, sentence, mark))

def paste(array, path, punctuation = "space"):
  """Merge files line by line with the ``paste`` command and save the output.

  Args:
    array: Paths of the input files, in column order.
    path: File that receives the standard output of ``paste``.
    punctuation: Delimiter name or symbol handed to ``paste -d``; it is
      resolved through convertPunctuationSymbol.
  """
  punctuation = convertPunctuationSymbol(punctuation)
  #argument list without a shell: paths containing spaces or shell
  #metacharacters are passed through verbatim instead of being re-parsed
  cmd = ["paste", "-d", punctuation] + list(array)
  #open() works on Python 2 and 3; the file() builtin exists on Python 2 only
  with open(path, "wb") as f:
    subprocess.call(cmd, stdout=f)

#download hightemp.txt only when there is no local copy yet
if not os.path.exists("./hightemp.txt"):
  download("http://www.cl.ecei.tohoku.ac.jp/nlp100/data/hightemp.txt","./hightemp.txt")

content = readFile("./hightemp.txt")
#split every non-empty line into its tab-separated fields once
rows = [line.split("\t") for line in content.split("\n") if line]
col1 = [fields[0] for fields in rows]
col2 = [fields[1] for fields in rows if len(fields) > 1]

generated = ["./col1.txt", "./col2.txt", "./col12.txt", "./pasteCol12.txt"]
try:
  saveFile(col1, "./col1.txt")
  saveFile(col2, "./col2.txt")
  col1Saved = readFile("./col1.txt")
  col2Saved = readFile("./col2.txt")
  #list(): zip() is a one-shot iterator on Python 3
  saveFile(list(zip(col1, col2)), "./col12.txt", punctuation = "tab")
  col12 = readFile("./col12.txt")
  #same merge done by the paste command, for comparison
  paste(["./col1.txt", "./col2.txt"], "./pasteCol12.txt", punctuation = "tab")
  col12Paste = readFile("./pasteCol12.txt")

  print("original file: \n" + content)
  print("col1: \n" + "\n".join(col1) + "\n")
  print("col2: \n" + "\n".join(col2) + "\n")
  print("saved col1: \n" + col1Saved)
  print("saved col2: \n" + col2Saved)
  print("col1 was merged with col2: \n" + col12)
  print("col1 was merged with col2 by paste command: \n" + col12Paste)
finally:
  #clean up the intermediate files even when a step above failed
  for generatedPath in generated:
    remove(generatedPath)
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	#Link: http://www.cl.ecei.tohoku.ac.jp/nlp100/

	"""
	hightemp.txt: http://www.cl.ecei.tohoku.ac.jp/nlp100/data/hightemp.txt
	は，日本の最高気温の記録を「都道府県」「地点」「℃」「日」のタブ区切り形式で格納したファイルである．
	以下の処理を行うプログラムを作成し，hightemp.txtを入力ファイルとして実行せよ．
	Question 13:
	13. col1.txtとcol2.txtをマージ
	12で作ったcol1.txtとcol2.txtを結合し，元のファイルの1列目と2列目をタブ区切りで並べたテキストファイルを作成せよ．
	確認にはpasteコマンドを用いよ．
	"""

	import subprocess
	import requests
	import os

	def download(url, dir, params=None):
	  """Fetch ``url`` with an HTTP GET and store the response body at ``dir``.

	  Args:
	    url: Address to request.
	    dir: Destination file path (a file, despite the parameter name).
	    params: Optional mapping of query-string parameters; ``None`` sends none.

	  Raises:
	    requests.HTTPError: If the server answers with a 4xx/5xx status.
	  """
	  dl = requests.get(url, params=params)
	  #fail loudly instead of saving an error page as if it were the data file
	  dl.raise_for_status()
	  #open() works on Python 2 and 3; the file() builtin exists on Python 2 only
	  with open(dir, "wb") as f:
	    f.write(dl.content)

	def readFile(path):
	  """Return the whole content of the text file at ``path`` as one string.

	  Args:
	    path: Path of the file to read.

	  Returns:
	    The file content, line endings included; "" for an empty file.
	  """
	  #open() works on Python 2 and 3; the file() builtin exists on Python 2 only
	  with open(path, "r") as f:
	    #read() returns the same text as concatenating every line by hand
	    return f.read()

	def remove(path):
	  """Delete ``path`` with ``os.remove`` when it exists; otherwise do nothing."""
	  #body indentation restored; without it the definition does not parse
	  if os.path.exists(path):
	    os.remove(path)

	def convertPunctuationSymbol(punctuation):
	  """Map a delimiter name, or the delimiter itself, to the delimiter string.

	  Args:
	    punctuation: One of "space", "tab", "comma", "period", "double space",
	      or the corresponding literal delimiter.

	  Returns:
	    The delimiter string; a single space for anything unrecognised.
	  """
	  if punctuation == "space" or punctuation == " ":
	    return " "
	  elif punctuation == "tab" or punctuation == "\t":
	    return "\t"
	  elif punctuation == "comma" or punctuation == ",":
	    return ","
	  elif punctuation == "period" or punctuation == ".":
	    return "."
	  #two spaces: with a single space this branch could never differ from "space"
	  elif punctuation == "double space" or punctuation == "  ":
	    return "  "
	  else:
	    #fallback for unknown input
	    return " "

	def saveFile(array, path, punctuation = "space"):
	  """Write ``array`` to ``path``, one element per line.

	  Args:
	    array: Either an iterable of strings (one column) or an iterable of
	      sequences of strings (several columns). Iterators such as the result
	      of zip() on Python 3 are accepted.
	    path: Destination file path.
	    punctuation: Delimiter name or symbol placed between columns; it is
	      resolved through convertPunctuationSymbol.

	  Returns:
	    None. When ``array`` is empty nothing is written and no file is created.
	  """
	  punctuation = convertPunctuationSymbol(punctuation)
	  #materialise once so len()/indexing also work for iterators
	  rows = list(array)
	  #array is empty
	  if not rows:
	    return None
	  #the first element decides the layout for every line
	  oneColumn = isinstance(rows[0], str)
	  howManyElementInLine = 0 if oneColumn else len(rows[0])
	  #text mode: the values written are str; open() works on Python 2 and 3
	  with open(path, "w") as f:
	    for row in rows:
	      if oneColumn:
	        f.write(row + "\n")
	      else:
	        #explicit indexing keeps the IndexError for rows shorter than the first
	        f.write(punctuation.join(row[ii] for ii in range(howManyElementInLine)) + "\n")

	def encloseByQuotation(sentence):
	  """Wrap ``sentence`` in quotes.

	  Single quotes are used unless ``sentence`` itself contains a single quote,
	  in which case double quotes are used.
	  """
	  #body indentation restored; without it the definition does not parse
	  if "'" in sentence:
	    quotation = '"'
	  else:
	    quotation = "'"
	  return quotation + sentence + quotation

	def paste(array, path, punctuation = "space"):
	  """Merge files line by line with the ``paste`` command and save the output.

	  Args:
	    array: Paths of the input files, in column order.
	    path: File that receives the standard output of ``paste``.
	    punctuation: Delimiter name or symbol handed to ``paste -d``; it is
	      resolved through convertPunctuationSymbol.
	  """
	  punctuation = convertPunctuationSymbol(punctuation)
	  #argument list without a shell: paths containing spaces or shell
	  #metacharacters are passed through verbatim instead of being re-parsed
	  cmd = ["paste", "-d", punctuation] + list(array)
	  #open() works on Python 2 and 3; the file() builtin exists on Python 2 only
	  with open(path, "wb") as f:
	    subprocess.call(cmd, stdout=f)

	#download hightemp.txt only when there is no local copy yet
	if not os.path.exists("./hightemp.txt"):
	  download("http://www.cl.ecei.tohoku.ac.jp/nlp100/data/hightemp.txt","./hightemp.txt")

	content = readFile("./hightemp.txt")
	#split every non-empty line into its tab-separated fields once
	rows = [line.split("\t") for line in content.split("\n") if line]
	col1 = [fields[0] for fields in rows]
	col2 = [fields[1] for fields in rows if len(fields) > 1]

	generated = ["./col1.txt", "./col2.txt", "./col12.txt", "./pasteCol12.txt"]
	try:
	  saveFile(col1, "./col1.txt")
	  saveFile(col2, "./col2.txt")
	  col1Saved = readFile("./col1.txt")
	  col2Saved = readFile("./col2.txt")
	  #list(): zip() is a one-shot iterator on Python 3
	  saveFile(list(zip(col1, col2)), "./col12.txt", punctuation = "tab")
	  col12 = readFile("./col12.txt")
	  #same merge done by the paste command, for comparison
	  paste(["./col1.txt", "./col2.txt"], "./pasteCol12.txt", punctuation = "tab")
	  col12Paste = readFile("./pasteCol12.txt")

	  print("original file: \n" + content)
	  print("col1: \n" + "\n".join(col1) + "\n")
	  print("col2: \n" + "\n".join(col2) + "\n")
	  print("saved col1: \n" + col1Saved)
	  print("saved col2: \n" + col2Saved)
	  print("col1 was merged with col2: \n" + col12)
	  print("col1 was merged with col2 by paste command: \n" + col12Paste)
	finally:
	  #clean up the intermediate files even when a step above failed
	  for generatedPath in generated:
	    remove(generatedPath)