Skip to content

Instantly share code, notes, and snippets.

@nutszebra
Created November 22, 2015 19:09
Show Gist options
  • Save nutszebra/02fa3d1089e39a492ba8 to your computer and use it in GitHub Desktop.
Save nutszebra/02fa3d1089e39a492ba8 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#Link: http://www.cl.ecei.tohoku.ac.jp/nlp100/
"""
hightemp.txt: http://www.cl.ecei.tohoku.ac.jp/nlp100/data/hightemp.txt
は,日本の最高気温の記録を「都道府県」「地点」「℃」「日」のタブ区切り形式で格納したファイルである.
以下の処理を行うプログラムを作成し,hightemp.txtを入力ファイルとして実行せよ.
Question 16:
16. ファイルをN分割する
自然数Nをコマンドライン引数などの手段で受け取り,入力のファイルを行単位でN分割せよ.
同様の処理をsplitコマンドで実現せよ
"""
import subprocess
import requests
import os
import re
def download(url, dir, params=None):
    """
    Fetch url with an HTTP GET request and save the response body to a file.

    url: address to download
    dir: path of the output file (a file path, despite the parameter name)
    params: optional dict of query-string parameters forwarded to requests.get

    Raises requests.HTTPError when the server answers with a 4xx/5xx status,
    so that an error page is never stored as if it were the requested data.
    """
    # None instead of a shared mutable {} default; requests treats both alike.
    dl = requests.get(url, params=params)
    # Fail before touching the output file when the request was unsuccessful.
    dl.raise_for_status()
    # open() works on Python 2 and 3; the file() builtin is Python 2 only.
    with open(dir, "wb") as f:
        f.write(dl.content)
import os
import mmap
import time
def countLine(path):
    """
    Count the lines of a file through a read-only memory map.

    path: file path to read

    Returns a tuple (number of lines, elapsed seconds).
    A last line without a trailing newline is counted as a line and an
    empty file yields 0.
    """
    import contextlib

    start = time.time()
    answer = 0
    with open(path, "rb") as f:
        # mmap refuses to map a zero-length file, so only map non-empty ones.
        if os.fstat(f.fileno()).st_size > 0:
            # access=ACCESS_READ is the portable read-only mapping
            # (prot=PROT_READ is only available on Unix).
            mapped = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
            # closing() releases the mapping even if reading fails.
            with contextlib.closing(mapped) as buf:
                # readline() returns an empty byte string at end of map.
                answer = sum(1 for _ in iter(buf.readline, b""))
    end = time.time()
    return (answer, end - start)
def readText(path, start=1, end=float("Inf"), direction="forward", block=None):
    """
    Read a range of lines from a text file.

    path: file path to read
    start: number (1-based) of the first line to read; values below 1 are
           treated as 1
    end: number (1-based, inclusive) of the last line to read; infinity
         means "until the end of the file"
         start <= end is expected; if end < start the single line at
         start is still returned
    direction: "forward" keeps the file order, any other value returns the
               selected lines in reverse order
    block: if given, only the last block lines of the start..end range are
           returned

    start, end and block may be numbers or numeric strings; fractional
    values are truncated ("3.4" --> 3).

    Returns a tuple (selected text, elapsed seconds).
    """
    import collections
    import itertools

    # "3.4" --> 3; a start below 1 used to leave the loop variable unbound.
    start = max(int(float(start)), 1)
    end = float(end)
    unbounded = end == float("Inf")
    if not unbounded:
        end = int(end)
    if block is not None:
        block = int(float(block))

    # start measuring
    time_start = time.time()
    with open(path, "r") as f:
        if unbounded:
            lines = itertools.islice(f, start - 1, None)
            if block is not None:
                # A bounded deque keeps only the trailing lines, so the file
                # does not need to be counted in a separate pass first.
                lines = collections.deque(lines, maxlen=max(block, 0))
            selected = list(lines)
        else:
            if block is not None:
                start = max(start, end - block + 1)
            # The line at start is always included, even when end < start.
            stop = max(end, start)
            selected = list(itertools.islice(f, start - 1, stop))
    time_end = time.time()

    if direction != "forward":
        selected.reverse()
    # A single join avoids the quadratic cost of repeated concatenation.
    return ("".join(selected), time_end - time_start)
def remove(path):
    """
    Delete the file at path; do nothing when it does not exist.

    path: file path to delete

    Any failure other than "no such file" (for example a permission error
    or path being a directory) is raised as OSError.
    """
    import errno

    # Try first and tolerate a missing file: checking os.path.exists()
    # beforehand leaves a window in which the file can disappear.
    try:
        os.remove(path)
    except OSError as exc:
        if exc.errno != errno.ENOENT:
            raise
def split(path, num):
    """
    Split a file into pieces of num lines each with the split command.

    path: file path to split (passed to the command verbatim, without
          shell expansion)
    num: number of lines per output file

    The pieces are written to the current directory with the name prefix
    "split.". The exit status of the command is not checked.
    """
    # An argument list without a shell keeps paths containing spaces or
    # shell metacharacters intact; "--" stops a path that begins with "-"
    # from being parsed as an option.
    cmd = ["split", "-l", str(num), "--", path, "split."]
    subprocess.call(cmd)
def saveFile(array, path):
    """
    Write every element of array to a file, one element per line.

    array: iterable of strings (byte strings or text)
    path: file path to write; an existing file is overwritten

    Text elements are encoded as UTF-8; each element is followed by "\\n".
    """
    # open() works on Python 2 and 3; the file() builtin is Python 2 only.
    with open(path, "wb") as f:
        for ele in array:
            # The file is binary, so text has to be encoded explicitly
            # (on Python 2 a plain str already is bytes and passes through).
            if not isinstance(ele, bytes):
                ele = ele.encode("utf-8")
            f.write(ele + b"\n")
# Download hightemp.txt unless a local copy already exists.
if not os.path.exists("./hightemp.txt"):
    download("http://www.cl.ecei.tohoku.ac.jp/nlp100/data/hightemp.txt", "./hightemp.txt")
content = readText("./hightemp.txt")[0]

# raw_input() exists only on Python 2; Python 3 calls the same thing input().
try:
    ask = raw_input
except NameError:
    ask = input
lines = int(ask("how many would you like to split a file into?: "))
if lines < 1:
    raise ValueError("the number of parts must be a positive integer")

totalLine = countLine("./hightemp.txt")[0]
# Lines per part, rounded half up. Computed by hand because round() rounds
# half to even on Python 3, and clamped to 1 so that "split -l" never
# receives 0 when more parts than lines are requested.
linesPerPart = max(1, int(totalLine / float(lines) + 0.5))
start = [linesPerPart * i + 1 for i in range(lines)]
# Every part ends right before the next one starts; the last part takes
# whatever remains.
end = [a - 1 for a in start[1:]] + [totalLine]
for i, (s, e) in enumerate(zip(start, end)):
    readContent = readText("./hightemp.txt", start=s, end=e)[0]
    name = "split" + str(i) + ".txt"
    # splitlines() yields no trailing empty element, so no extra blank line
    # is appended to the split file.
    saveFile(readContent.splitlines(), "./" + name)

print("original file: \n" + content)
for i in range(lines):
    name = "split" + str(i) + ".txt"
    readContent = readText("./" + name)[0]
    print(name + ": \n" + readContent)
    remove("./" + name)

# Same split done by the split command, using the same chunk size.
print("split command")
split("./hightemp.txt", linesPerPart)
splitFile = sorted(i for i in os.listdir("./") if i.startswith("split."))
for name in splitFile:
    readContent = readText("./" + name)[0]
    print(name + ": \n" + readContent)
    remove("./" + name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment