nutszebra/100_questions_NLP_019

## 100_questions_NLP_019
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#Link: http://www.cl.ecei.tohoku.ac.jp/nlp100/

"""
hightemp.txt: http://www.cl.ecei.tohoku.ac.jp/nlp100/data/hightemp.txt
は，日本の最高気温の記録を「都道府県」「地点」「℃」「日」のタブ区切り形式で格納したファイルである．
以下の処理を行うプログラムを作成し，hightemp.txtを入力ファイルとして実行せよ．
Question 19:
19. 各行の1コラム目の文字列の出現頻度を求め，出現頻度の高い順に並べる
各行の1列目の文字列の出現頻度を求め，その高い順に並べて表示せよ．確認にはcut, uniq, sortコマンドを用いよ．
"""

import subprocess
import requests
import os
import re

def download(url, dir, params=None):
  """
  Fetch url with an HTTP GET and save the response body to a local file.

  url: address to request
  dir: path of the file the body is written to (overwritten if it exists)
  params: optional dict of query parameters handed to requests.get
  Raises requests.HTTPError when the server answers with an error status,
  so an error page is never stored as if it were the requested data.
  """
  # None replaces the shared mutable {} default; requests treats it as "no parameters"
  dl = requests.get(url, params=params)
  dl.raise_for_status()
  # open() works on Python 2 and 3; the file() builtin exists only on Python 2
  with open(dir, "wb") as f:
    f.write(dl.content)

import os
import mmap
import time

def countLine(path):
  """
  Count the lines of a file by scanning a read-only memory map of it.

  path: file path to count
  Returns (number of lines, seconds spent counting). A last line without a
  trailing newline is counted as a line; an empty file gives 0.
  """
  start = time.time()
  answer = 0
  fd = os.open(path, os.O_RDONLY)
  try:
    # mmap cannot map zero bytes, so an empty file is reported as 0 lines
    if os.fstat(fd).st_size > 0:
      # access= is accepted on every platform, unlike the Unix-only prot=
      buf = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
      try:
        readline = buf.readline
        while readline():
          answer += 1
      finally:
        buf.close()
  finally:
    # release the descriptor even if mapping or reading failed
    os.close(fd)
  end = time.time()
  return (answer, end - start)

def readText(path, start = 1, end = float("Inf"), direction = "forward", block = None):
  """
  Read lines start..end (1-based, inclusive) of a text file into one string.

  path: file path to read
  start: number of the first line to read; values below 1 are treated as 1
  end: number of the last line to read (default: up to the end of the file);
       an empty string is returned when end < start
  direction: "forward" keeps the file order, any other value returns the
             selected lines in reverse order
  block: when given, only the last `block` lines of the start..end range
         are returned
  start, end and block may be numeric strings such as "3.4" (truncated to 3).
  Returns (text, seconds spent); the time includes counting the lines of the
  file when block is given without an explicit end.
  """

  #"3.4" --> 3
  start = max(int(float(start)), 1)
  if block is not None:
    block = int(float(block))
  if end != float("Inf"):
    end = int(float(end))

  counting_time = 0
  if block is not None:
    if end == float("Inf"):
      # the last line number is needed to locate the final `block` lines
      end, counting_time = countLine(path)
    start = max(start, end - block + 1)

  lines = []
  count = 0
  line = ""
  #start measuring
  time_start = time.time()
  with open(path, "r") as f:
    # skip ahead until `line` holds line number `start`
    while count < start:
      line = f.readline()
      count += 1
      if not line:
        # file is shorter than start: nothing to return
        break
    while line and count <= end:
      lines.append(line)
      line = f.readline()
      count += 1
  if direction != "forward":
    lines.reverse()
  # one join instead of repeated string concatenation
  answer = "".join(lines)
  time_end = time.time()
  return (answer, time_end - time_start + counting_time)

def remove(path):
  """
  Delete the file at path; do nothing when the path does not exist.

  path: file path to delete
  """
  if not os.path.exists(path):
    return
  os.remove(path)

def calculateFrequencyColumn(path, column):
  """
  Count how often each value of one tab-separated column occurs, using the
  external cut, sort and uniq commands.

  path: file path to read
  column: 1-based number of the column (passed to cut -f)
  Returns the output of `cut | sort | uniq -c | sort -rn` as one string:
  one "count value" line per distinct value, highest count first.
  """
  # Each stage is started from an argument list (no shell), so spaces or
  # shell metacharacters in path cannot break or hijack the command.
  stages = [
    ["cut", "-f" + str(column), "--", path],
    ["sort"],
    ["uniq", "-c"],
    # -n compares the counts as numbers; plain -r would compare them as text
    ["sort", "-rn"],
  ]
  procs = []
  upstream = None
  for argv in stages:
    proc = subprocess.Popen(argv, stdin=upstream, stdout=subprocess.PIPE)
    if upstream is not None:
      # the new stage holds its own copy of the pipe; drop ours
      upstream.close()
    upstream = proc.stdout
    procs.append(proc)
  output = procs[-1].communicate()[0]
  for proc in procs[:-1]:
    proc.wait()
  # pipes yield bytes on Python 3 and str on Python 2
  if not isinstance(output, str):
    output = output.decode("utf-8")
  return output

from collections import Counter

#download hightemp.txt (only when no local copy exists yet)
if not os.path.exists("./hightemp.txt"):
  download("http://www.cl.ecei.tohoku.ac.jp/nlp100/data/hightemp.txt","./hightemp.txt")

content = readText("./hightemp.txt")[0]
# Filtering empty strings instead of slicing off the last element keeps the
# final record even when the file does not end with a newline.
content_split = [ele.split("\t") for ele in content.split("\n") if ele]
# occurrences of each first-column value
frequency = Counter(ele[0] for ele in content_split)

print("original file: \n" + content)
# most_common() lists the (value, count) pairs from the highest count down
print("count frequency: \n" + "\n".join([str(ele[1]) +" "+  ele[0] for ele in frequency.most_common()]))
print("cut sort uniq command: \n" + calculateFrequencyColumn("./hightemp.txt",1))
	#!/usr/bin/env python
	# -- coding: utf-8 --

	#Link: http://www.cl.ecei.tohoku.ac.jp/nlp100/

	"""
	hightemp.txt: http://www.cl.ecei.tohoku.ac.jp/nlp100/data/hightemp.txt
	は，日本の最高気温の記録を「都道府県」「地点」「℃」「日」のタブ区切り形式で格納したファイルである．
	以下の処理を行うプログラムを作成し，hightemp.txtを入力ファイルとして実行せよ．
	Question 19:
	19. 各行の1コラム目の文字列の出現頻度を求め，出現頻度の高い順に並べる
	各行の1列目の文字列の出現頻度を求め，その高い順に並べて表示せよ．確認にはcut, uniq, sortコマンドを用いよ．
	"""

	import subprocess
	import requests
	import os
	import re

	def download(url, dir, params=None):
	  """
	  Fetch url with an HTTP GET and save the response body to a local file.

	  url: address to request
	  dir: path of the file the body is written to (overwritten if it exists)
	  params: optional dict of query parameters handed to requests.get
	  Raises requests.HTTPError when the server answers with an error status,
	  so an error page is never stored as if it were the requested data.
	  """
	  # None replaces the shared mutable {} default; requests treats it as "no parameters"
	  dl = requests.get(url, params=params)
	  dl.raise_for_status()
	  # open() works on Python 2 and 3; the file() builtin exists only on Python 2
	  with open(dir, "wb") as f:
	    f.write(dl.content)

	import os
	import mmap
	import time

	def countLine(path):
	  """
	  Count the lines of a file by scanning a read-only memory map of it.

	  path: file path to count
	  Returns (number of lines, seconds spent counting). A last line without a
	  trailing newline is counted as a line; an empty file gives 0.
	  """
	  start = time.time()
	  answer = 0
	  fd = os.open(path, os.O_RDONLY)
	  try:
	    # mmap cannot map zero bytes, so an empty file is reported as 0 lines
	    if os.fstat(fd).st_size > 0:
	      # access= is accepted on every platform, unlike the Unix-only prot=
	      buf = mmap.mmap(fd, 0, access=mmap.ACCESS_READ)
	      try:
	        readline = buf.readline
	        while readline():
	          answer += 1
	      finally:
	        buf.close()
	  finally:
	    # release the descriptor even if mapping or reading failed
	    os.close(fd)
	  end = time.time()
	  return (answer, end - start)

	def readText(path, start = 1, end = float("Inf"), direction = "forward", block = None):
	  """
	  Read lines start..end (1-based, inclusive) of a text file into one string.

	  path: file path to read
	  start: number of the first line to read; values below 1 are treated as 1
	  end: number of the last line to read (default: up to the end of the file);
	       an empty string is returned when end < start
	  direction: "forward" keeps the file order, any other value returns the
	             selected lines in reverse order
	  block: when given, only the last `block` lines of the start..end range
	         are returned
	  start, end and block may be numeric strings such as "3.4" (truncated to 3).
	  Returns (text, seconds spent); the time includes counting the lines of the
	  file when block is given without an explicit end.
	  """

	  #"3.4" --> 3
	  start = max(int(float(start)), 1)
	  if block is not None:
	    block = int(float(block))
	  if end != float("Inf"):
	    end = int(float(end))

	  counting_time = 0
	  if block is not None:
	    if end == float("Inf"):
	      # the last line number is needed to locate the final `block` lines
	      end, counting_time = countLine(path)
	    start = max(start, end - block + 1)

	  lines = []
	  count = 0
	  line = ""
	  #start measuring
	  time_start = time.time()
	  with open(path, "r") as f:
	    # skip ahead until `line` holds line number `start`
	    while count < start:
	      line = f.readline()
	      count += 1
	      if not line:
	        # file is shorter than start: nothing to return
	        break
	    while line and count <= end:
	      lines.append(line)
	      line = f.readline()
	      count += 1
	  if direction != "forward":
	    lines.reverse()
	  # one join instead of repeated string concatenation
	  answer = "".join(lines)
	  time_end = time.time()
	  return (answer, time_end - time_start + counting_time)

	def remove(path):
	  """
	  Delete the file at path; do nothing when the path does not exist.

	  path: file path to delete
	  """
	  if os.path.exists(path):
	    os.remove(path)

	def calculateFrequencyColumn(path, column):
	  """
	  Count how often each value of one tab-separated column occurs, using the
	  external cut, sort and uniq commands.

	  path: file path to read
	  column: 1-based number of the column (passed to cut -f)
	  Returns the output of `cut | sort | uniq -c | sort -rn` as one string:
	  one "count value" line per distinct value, highest count first.
	  """
	  # Each stage is started from an argument list (no shell), so spaces or
	  # shell metacharacters in path cannot break or hijack the command.
	  stages = [
	    ["cut", "-f" + str(column), "--", path],
	    ["sort"],
	    ["uniq", "-c"],
	    # -n compares the counts as numbers; plain -r would compare them as text
	    ["sort", "-rn"],
	  ]
	  procs = []
	  upstream = None
	  for argv in stages:
	    proc = subprocess.Popen(argv, stdin=upstream, stdout=subprocess.PIPE)
	    if upstream is not None:
	      # the new stage holds its own copy of the pipe; drop ours
	      upstream.close()
	    upstream = proc.stdout
	    procs.append(proc)
	  output = procs[-1].communicate()[0]
	  for proc in procs[:-1]:
	    proc.wait()
	  # pipes yield bytes on Python 3 and str on Python 2
	  if not isinstance(output, str):
	    output = output.decode("utf-8")
	  return output

	from collections import Counter

	#download hightemp.txt (only when no local copy exists yet)
	if not os.path.exists("./hightemp.txt"):
	  download("http://www.cl.ecei.tohoku.ac.jp/nlp100/data/hightemp.txt","./hightemp.txt")

	content = readText("./hightemp.txt")[0]
	# Filtering empty strings instead of slicing off the last element keeps the
	# final record even when the file does not end with a newline.
	content_split = [ele.split("\t") for ele in content.split("\n") if ele]
	# occurrences of each first-column value
	frequency = Counter(ele[0] for ele in content_split)

	print("original file: \n" + content)
	# most_common() lists the (value, count) pairs from the highest count down
	print("count frequency: \n" + "\n".join([str(ele[1]) +" "+ ele[0] for ele in frequency.most_common()]))
	print("cut sort uniq command: \n" + calculateFrequencyColumn("./hightemp.txt",1))