Created
November 22, 2015 19:09
-
-
Save nutszebra/02fa3d1089e39a492ba8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
#Link: http://www.cl.ecei.tohoku.ac.jp/nlp100/ | |
""" | |
hightemp.txt: http://www.cl.ecei.tohoku.ac.jp/nlp100/data/hightemp.txt | |
は,日本の最高気温の記録を「都道府県」「地点」「℃」「日」のタブ区切り形式で格納したファイルである. | |
以下の処理を行うプログラムを作成し,hightemp.txtを入力ファイルとして実行せよ. | |
Question 16: | |
16. ファイルをN分割する | |
自然数Nをコマンドライン引数などの手段で受け取り,入力のファイルを行単位でN分割せよ. | |
同様の処理をsplitコマンドで実現せよ | |
""" | |
import subprocess | |
import requests | |
import os | |
import re | |
def download(url, dir, params=None):
    """
    Fetch url with an HTTP GET and save the response body to a local file.

    url: address to download
    dir: path of the file to write (despite the name it is opened as a file)
    params: optional dict of query parameters handed to requests.get
    raises: requests.HTTPError when the server answers with an error status
    """
    # None instead of {} so that no mutable default is shared between calls
    dl = requests.get(url, params=params)
    # fail loudly rather than saving an HTTP error page as if it were data
    dl.raise_for_status()
    # open() works on Python 2 and 3; the file() builtin exists only on 2
    with open(dir, "wb") as f:
        f.write(dl.content)
import os | |
import mmap | |
import time | |
def countLine(path):
    """
    Count the lines of a file by scanning a read-only memory map of it.

    path: file path to read
    returns: tuple (number of lines, elapsed seconds)

    A last line without a trailing newline is still counted, and an empty
    file yields 0.
    """
    start = time.time()
    answer = 0
    with open(path, "rb") as f:
        # mmap refuses to map a zero-length file, so only map when non-empty
        if os.fstat(f.fileno()).st_size > 0:
            # access= (instead of prot=) is accepted on Unix and Windows alike
            buf = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
            try:
                readline = buf.readline
                while readline():
                    answer += 1
            finally:
                # release the mapping even if reading fails
                buf.close()
    end = time.time()
    return (answer, end - start)
def readText(path, start = 1, end = float("Inf"), direction = "forward", block = None):
    """
    Read a range of lines from a text file.

    path: file path to read
    start: 1-based number of the first line to read; numeric strings and
           floats are truncated ("3.4" --> 3) and values below 1 mean 1
    end: 1-based number of the last line to read (inclusive); infinity means
         "until the end of the file". The line at start is returned even
         when end is smaller than start.
    direction: "forward" keeps the file order, any other value returns the
               lines in reverse order
    block: when given, only the last `block` lines of the range are returned;
           if end is infinite the file length is obtained with countLine
    returns: tuple (text, elapsed seconds); the time spent in countLine is
             included in the elapsed seconds
    """
    #"3.4" --> 3
    start = int(float(start))
    block = int(float(block)) if block is not None else None
    if end != float("Inf"):
        end = int(float(end))

    count_time = 0
    if block is not None:
        # block case: the range has to be anchored at a concrete last line
        if end == float("Inf"):
            end, count_time = countLine(path)
        start = max(start, end - block + 1)
    # a start below 1 previously left the first line unread and crashed
    start = max(start, 1)

    #start measuring
    time_start = time.time()
    collected = []
    with open(path, "r") as f:
        for number, line in enumerate(f, 1):
            if number < start:
                continue
            collected.append(line)
            if number >= end:
                break
    time_end = time.time()

    if direction != "forward":
        collected.reverse()
    # join once instead of concatenating strings line by line
    return ("".join(collected), time_end - time_start + count_time)
def remove(path):
    """
    Delete the file at path; do nothing when the path does not exist.

    path: file path to delete
    """
    if not os.path.exists(path):
        return
    os.remove(path)
def split(path, num):
    """
    Run the external `split` command on a file.

    path: file to split
    num: number of lines per output piece (passed to `split -l`)

    The pieces are written to the current working directory with the
    prefix "split.".
    """
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact instead of letting the shell re-parse them.
    subprocess.call(["split", "-l", str(num), path, "split."])
def saveFile(array, path):
    """
    Write the elements of array to a file, one element per line.

    array: iterable of strings; a newline is appended to each element
    path: file path to write (an existing file is overwritten)
    """
    # Text mode: the elements are str, which a "wb" handle rejects on
    # Python 3. open() also replaces the Python-2-only file() builtin.
    with open(path, "w") as f:
        f.writelines(ele + "\n" for ele in array)
#download hightemp.txt (only when there is no local copy yet)
if not os.path.exists("./hightemp.txt"):
    download("http://www.cl.ecei.tohoku.ac.jp/nlp100/data/hightemp.txt","./hightemp.txt")
content = readText("./hightemp.txt")[0]

#raw_input exists only on Python 2; fall back to input on Python 3
try:
    ask = raw_input
except NameError:
    ask = input
lines = ask("how many would you like to split a file into?: ")
lines = int(lines)
if lines < 1:
    #0 would divide by zero below and a negative count makes no sense
    raise ValueError("the number of pieces must be a natural number (>= 1)")

totalLine = countLine("./hightemp.txt")[0]
#lines per piece; never below 1, otherwise every piece would start at line 1
#and `split -l 0` would be rejected by the split command
linesPerPiece = max(1, int(round(totalLine / float(lines))))
#1-based first and last line of every piece; the last piece takes the rest
start = [linesPerPiece * i + 1 for i in range(0, lines)]
end = [a - 1 for a in start[1:]] + [totalLine]

#split with Python
for i, (s, e) in enumerate(zip(start, end)):
    readContent = readText("./hightemp.txt",start = s, end = e)[0]
    name = "split" + str(i) + ".txt"
    pieceLines = readContent.split("\n")
    #text ending with a newline leaves an empty last element, which
    #saveFile would otherwise write out as an extra blank line
    if pieceLines[-1] == "":
        pieceLines.pop()
    saveFile(pieceLines, "./" + name)

print("original file: \n" + content)
for i in range(0, lines):
    name = "split" + str(i) + ".txt"
    readContent = readText("./" + name)[0]
    print(name + ": \n" + readContent)
    remove("./" + name)

#split with the split command
print("split command")
split("./hightemp.txt", linesPerPiece)
#every file in the working directory whose name starts with "split." is
#treated as output of the split command: it is printed and then deleted
splitFile = [i for i in os.listdir("./") if i.startswith("split.")]
splitFile.sort()
for name in splitFile:
    readContent = readText("./" + name)[0]
    print(name + ": \n" + readContent)
    remove("./" + name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment