Created
November 21, 2015 15:41
-
-
Save nutszebra/6a53618b09447f7a89d3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
#Link: http://www.cl.ecei.tohoku.ac.jp/nlp100/ | |
""" | |
hightemp.txt: http://www.cl.ecei.tohoku.ac.jp/nlp100/data/hightemp.txt | |
は,日本の最高気温の記録を「都道府県」「地点」「℃」「日」のタブ区切り形式で格納したファイルである. | |
以下の処理を行うプログラムを作成し,hightemp.txtを入力ファイルとして実行せよ. | |
Question 13: | |
13. col1.txtとcol2.txtをマージ | |
12で作ったcol1.txtとcol2.txtを結合し,元のファイルの1列目と2列目をタブ区切りで並べたテキストファイルを作成せよ. | |
確認にはpasteコマンドを用いよ. | |
""" | |
import subprocess | |
import requests | |
import os | |
def download(url, dir, params=None):
    """Fetch ``url`` with an HTTP GET and store the response body in a file.

    Args:
        url: Address to request.
        dir: Path of the file to write (it is opened as a file, not used as
            a directory; the parameter name is kept for existing callers).
        params: Optional query parameters forwarded to ``requests.get``.
            Defaults to ``None`` instead of a shared mutable ``{}``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    dl = requests.get(url, params=params)
    # Fail before touching the disk; otherwise an error page would be saved
    # as if it were the requested data.
    dl.raise_for_status()
    # open() replaces the Python-2-only file() builtin.
    with open(dir, "wb") as f:
        f.write(dl.content)
def readFile(path):
    """Return the entire content of the file at ``path`` as one string.

    Args:
        path: Path of the file to read.

    Returns:
        The file content, line endings included.
    """
    # open() replaces the Python-2-only file() builtin, and a single read()
    # avoids rebuilding the result string once per line.
    with open(path, "r") as f:
        return f.read()
def remove(path):
    """Delete the file at ``path``; do nothing when no such path exists."""
    if not os.path.exists(path):
        return
    os.remove(path)
def convertPunctuationSymbol(punctuation):
    """Translate a separator name (or the separator itself) into the separator.

    Args:
        punctuation: One of the names "space", "tab", "comma", "period" or
            "double space", or the corresponding literal separator.

    Returns:
        The separator string. Anything unrecognised falls back to a single
        space.
    """
    # Built by multiplication so the two-space separator cannot be silently
    # collapsed into one space; previously the "double space" branch returned
    # a single space and its literal comparison could never be reached.
    doubleSpace = " " * 2
    symbols = (
        ("space", " "),
        ("tab", "\t"),
        ("comma", ","),
        ("period", "."),
        ("double space", doubleSpace),
    )
    for name, symbol in symbols:
        # Plain == comparisons (no hashing), so any argument type is tolerated.
        if punctuation == name or punctuation == symbol:
            return symbol
    return " "
def saveFile(array, path, punctuation="space"):
    """Write ``array`` to ``path``, one element per line.

    Args:
        array: Iterable of strings (one column per line) or of sequences of
            strings (several columns per line). The first element decides
            which of the two layouts is used.
        path: Path of the file to write.
        punctuation: Separator name or literal, resolved through
            ``convertPunctuationSymbol`` and placed between the columns of
            a row.

    Returns:
        None. When ``array`` is empty nothing is written and no file is
        created.
    """
    punctuation = convertPunctuationSymbol(punctuation)
    # Materialise first so iterators (e.g. a Python 3 zip object) are
    # accepted as well as lists.
    rows = list(array)
    if not rows:
        return None
    oneColumn = isinstance(rows[0], str)
    # Text mode: the rows are str, which a binary handle rejects on Python 3.
    with open(path, "w") as f:
        for row in rows:
            # Joining each row on its own writes every row in full instead
            # of assuming all rows are as wide as the first one.
            line = row if oneColumn else punctuation.join(row)
            f.write(line + "\n")
def encloseByQuotation(sentence):
    """Wrap ``sentence`` in quotes.

    Single quotes are used unless ``sentence`` itself contains a single
    quote, in which case double quotes are used.
    """
    quotation = '"' if "'" in sentence else "'"
    return "".join((quotation, sentence, quotation))
def paste(array, path, punctuation="space"):
    """Merge files column-wise with the ``paste`` command and save the output.

    Args:
        array: Paths of the input files, in column order.
        path: Path of the file that receives the command's standard output.
        punctuation: Separator name or literal, resolved through
            ``convertPunctuationSymbol`` and handed to ``paste -d``.

    Note:
        The exit status of ``paste`` is not checked.
    """
    punctuation = convertPunctuationSymbol(punctuation)
    # Pass an argument list and skip the shell: the delimiter and the paths
    # reach paste verbatim, so no quoting is needed and spaces or shell
    # metacharacters in a path cannot be reinterpreted.
    cmd = ["paste", "-d", punctuation] + list(array)
    # open() replaces the Python-2-only file() builtin.
    with open(path, "wb") as f:
        subprocess.call(cmd, stdout=f)
# Fetch hightemp.txt only when no local copy exists yet.
if not os.path.exists("./hightemp.txt"):
    download("http://www.cl.ecei.tohoku.ac.jp/nlp100/data/hightemp.txt", "./hightemp.txt")
content = readFile("./hightemp.txt")
# Split every non-empty line once, then pick the first and second
# tab-separated fields; lines without a second field add nothing to col2.
rows = [line.split("\t") for line in content.split("\n") if line]
col1 = [row[0] for row in rows]
col2 = [row[1] for row in rows if len(row) > 1]
try:
    saveFile(col1, "./col1.txt")
    saveFile(col2, "./col2.txt")
    col1Saved = readFile("./col1.txt")
    col2Saved = readFile("./col2.txt")
    # Merge in Python; list() because saveFile needs a sized sequence and
    # zip() returns an iterator on Python 3.
    saveFile(list(zip(col1, col2)), "./col12.txt", punctuation="tab")
    col12 = readFile("./col12.txt")
    # Merge with the paste command so both results can be compared.
    paste(["./col1.txt", "./col2.txt"], "./pasteCol12.txt", punctuation="tab")
    col12Paste = readFile("./pasteCol12.txt")
    print("original file: \n" + content)
    print("col1: \n" + "\n".join(col1) + "\n")
    print("col2: \n" + "\n".join(col2) + "\n")
    print("saved col1: \n" + col1Saved)
    print("saved col2: \n" + col2Saved)
    print("col1 was merged with col2: \n" + col12)
    print("col1 was merged with col2 by paste command: \n" + col12Paste)
finally:
    # Remove the scratch files even when one of the steps above failed.
    remove("./col1.txt")
    remove("./col2.txt")
    remove("./col12.txt")
    remove("./pasteCol12.txt")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment