Skip to content

Instantly share code, notes, and snippets.

@isdyy
Last active December 19, 2015 21:39
Show Gist options
  • Save isdyy/6022027 to your computer and use it in GitHub Desktop.
Save isdyy/6022027 to your computer and use it in GitHub Desktop.
ngram実験
# -*- coding: utf-8 -*-
def ngram_word_split(u):
return u.split()
def ngram_split(u, n):
length = len(u)
pos = 0
res = []
while pos < length:
ngram = u[pos:pos+n]
if len(ngram.strip()) > 1:
res.append(ngram)
pos += 1
return res
def ngram_print(u):
for w in ngram_word_split(u):
l = len(w)
n = 2
while n <= l:
for s in ngram_split(w, n):
print s
n += 1
ngram_print(u'やまざき まさよし')
print '--'
ngram_print(u'たかはし ゆきひろ')
print '--'
ngram_print(u'')
print '--'
ngram_print(u' ')
# 重複除去が必要
まざ
ざき
やまざ
まざき
ざき
やまざき
まざき
ざき
まさ
さよ
よし
まさよ
さよし
よし
まさよし
さよし
よし
--
たか
かは
はし
たかは
かはし
はし
たかはし
かはし
はし
ゆき
きひ
ひろ
ゆきひ
きひろ
ひろ
ゆきひろ
きひろ
ひろ
--
--
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment