import codecs
# codecs: opens the input file with an explicit text encoding
import operator
# operator: itemgetter supplies the sort key for ordering the frequency table

# Read the source text, decoding it as UTF-8. The with-block guarantees the
# file handle is closed even if reading fails part-way through.
with codecs.open("text.txt","r","utf-8") as text:
    # Join every line into one continuous string so n-grams can span line
    # boundaries. Both "\r" and "\n" are stripped so that Windows-style line
    # endings do not leak carriage returns into the n-grams. Any future
    # preprocessing (e.g. removing punctuation) belongs here as well.
    text_new = "".join(line.rstrip("\r\n") for line in text)

def ngram(text,n):
    """Count every contiguous run of ``n`` characters in ``text``.

    Args:
        text: The preprocessed text to scan (a string or any other
            sliceable sequence whose slices are hashable).
        n: Length of each n-gram in characters; must be at least 1.

    Returns:
        A list of ``(gram, count)`` tuples sorted by count, highest first.
        The list is empty when ``text`` is shorter than ``n``. Grams with
        equal counts keep the dictionary's iteration order (first-seen
        order on Python 3.7+), because the sort is stable.

    Raises:
        ValueError: If ``n`` is smaller than 1, since slicing would then
            produce empty or misaligned grams instead of real n-grams.
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))

    words_freq = {}  # gram -> number of occurrences

    # A window of width n fits at len(text) - n + 1 start positions; range()
    # is empty when that number is zero or negative.
    for start in range(len(text) - n + 1):
        gram = text[start:start + n]
        # Tally in a single pass instead of calling list.count() per gram,
        # which rescans the whole list and makes the function quadratic.
        words_freq[gram] = words_freq.get(gram, 0) + 1

    # .items() exists on both Python 2 and Python 3 (iteritems() does not).
    return sorted(words_freq.items(), key=operator.itemgetter(1), reverse=True)

# Print every 3-character n-gram with its count, most frequent first.
words_freqs = ngram(text_new,3)

for word, count in words_freqs:
    # Format into a single string so the call behaves identically as the
    # Python 2 print statement and as the Python 3 print() function.
    print("%s %s" % (word, count))

# Sample output of the 3-gram run:
'''
道：" 35
笑道： 13
"那僧 9
聽了， 8
"士隱 8
。士隱 7
。"那 7
那僧道 6
....
'''

# Print every 2-character n-gram with its count, most frequent first.
words_freqs = ngram(text_new,2)

for word, count in words_freqs:
    # Format into a single string so the call behaves identically as the
    # Python 2 print statement and as the Python 3 print() function.
    print("%s %s" % (word, count))

# Sample output of the 2-gram run:
'''
：" 45
道： 36
士隱 33
雨村 25
，不 24
。" 22
那僧 17
，便 16
...
'''