Skip to content

Instantly share code, notes, and snippets.

@nutszebra
Last active November 20, 2015 14:27
Show Gist options
  • Save nutszebra/5e29c345b700498bcc5b to your computer and use it in GitHub Desktop.
Save nutszebra/5e29c345b700498bcc5b to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#Link: http://www.cl.ecei.tohoku.ac.jp/nlp100/
"""
Question 05:
05. n-gram
Write a function that builds n-grams from a given sequence (a string, a list, etc.).
Use it to obtain the word bi-grams and character bi-grams of the sentence "I am an NLPer".
"""
import re
# Tokenization rule: a word (letters, digits, underscores, apostrophes) stays
# whole, while each of , . ! : becomes its own single-character token.
def parseSentence(sentence):
    """Split ``sentence`` into word and punctuation tokens.

    Args:
        sentence: Text to tokenize.

    Returns:
        A list of tokens in order of appearance. Runs of word characters and
        apostrophes (e.g. ``don't``) form one token; each comma, period,
        exclamation mark and colon is returned as a separate token.
        Whitespace and any other characters are dropped.
    """
    # The comma must NOT be inside the word character class; otherwise it is
    # absorbed into the neighbouring word ("Hello," instead of "Hello", ",")
    # and the punctuation alternative can never match it.
    return re.findall(r"[\w']+|[,.!:]", sentence)
"""
When option is "letter", character-level n-grams are produced; any other option value produces word-level n-grams.
Examples:
In [165]: nGram("I am an NLPer",2,option="sentence")
Out[165]: [('I', 'am'), ('am', 'an'), ('an', 'NLPer')]
In [166]: nGram("abcdefg",2)
Out[166]: [('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e'), ('e', 'f'), ('f', 'g')]
In [167]: nGram("abcdefg",100)
Out[167]: []
In [168]: nGram("I am an NLPer",200,option="sentence")
Out[168]: []
"""
def nGram(target, n, option="letter"):
    """Return the n-grams of ``target`` as a list of tuples.

    With ``option="letter"`` (the default) the window slides over ``target``
    itself, so a string yields character n-grams and a list yields n-grams
    of its items. With any other ``option`` value, ``target`` is first
    tokenized by ``parseSentence`` and the window slides over the tokens.

    Args:
        target: Sequence to slide over (letter mode), or the sentence string
            to tokenize (any other mode).
        n: Window size; coerced with ``int()``.
        option: ``"letter"`` for element-level n-grams, anything else for
            word-level n-grams.

    Returns:
        A list of ``n``-tuples in order of appearance. The list is empty when
        ``n`` is smaller than 1 or larger than the number of items.
    """
    # Coerce once so the window count and the slice bounds use the same value.
    size = int(n)
    if size < 1:
        # A non-positive window has no meaningful n-grams.
        return []
    items = target if option == "letter" else parseSentence(target)
    # range() with a non-positive count is empty, covering the size > len case.
    return [tuple(items[i:i + size]) for i in range(len(items) - size + 1)]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment