nutszebra/100_questions_NLP_005

## 100_questions_NLP_005
#!/usr/bin/env python
# -*- coding: utf-8 -*-

#Link: http://www.cl.ecei.tohoku.ac.jp/nlp100/

"""
Question 05:
05. n-gram
与えられたシーケンス（文字列やリストなど）からn-gramを作る関数を作成せよ．
この関数を用い，"I am an NLPer"という文から単語bi-gram，文字bi-gramを得よ．
"""

import re

#文のパースのルールを書く。, . !を1文字と区別する
def parseSentence(sentence):
  """Split ``sentence`` into a list of tokens.

  A token is either a run of word characters, commas and apostrophes, or a
  single ``,``, ``.``, ``!`` or ``:``.  Anything else (whitespace, for
  example) is skipped.

  NOTE: the comma is part of the first character class, so a comma that
  directly follows a word stays inside that word's token (``"Hello,"``).

  Args:
    sentence: The text to tokenize.

  Returns:
    The tokens in the order they appear in ``sentence``.
  """
  token_pattern = re.compile(r"[\w,']+|,|\.|!|:")
  return token_pattern.findall(sentence)

"""
optionがletterの場合は文字列レベルのn-gramを行い、それ以外のoptionでは単語レベルのn-gramを行う
例:
In [165]: nGram("I am an NLPer",2,option="sentence")
Out[165]: [('I', 'am'), ('am', 'an'), ('an', 'NLPer')]
In [166]: nGram("abcdefg",2)
Out[166]: [('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e'), ('e', 'f'), ('f', 'g')]
In [167]: nGram("abcdefg",100)
Out[167]: []
In [168]: nGram("I am an NLPer",200,option="sentence")
Out[168]: []
"""

def nGram(target, n, option="letter"):
  """Build the list of n-grams of ``target``.

  With ``option == "letter"`` the window slides directly over ``target``
  (the characters of a string, or the items of any sliceable sequence).
  With any other ``option`` value, ``target`` is first tokenized with
  ``parseSentence`` and the window slides over the resulting tokens.

  Args:
    target: Sequence to slice (letter mode) or sentence to tokenize.
    n: Window size; any value accepted by ``int()``.
    option: ``"letter"`` for element-level n-grams, anything else for
      word-level n-grams.

  Returns:
    A list of tuples, each holding ``n`` consecutive items, in order of
    appearance.  The list is empty when ``n`` is smaller than 1 or larger
    than the number of available items.

  Raises:
    ValueError, TypeError: If ``n`` cannot be converted by ``int()``.

  Example:
    >>> nGram("abcd", 2)
    [('a', 'b'), ('b', 'c'), ('c', 'd')]
  """
  # Convert once so the window count and the slice bounds always agree.
  size = int(n)
  items = target if option == "letter" else parseSentence(target)
  if size < 1:
    # A window of zero or negative width has no meaningful n-grams.
    return []
  # range() works on Python 2 and 3 (xrange is Python 2 only); a negative
  # count simply yields an empty range, hence an empty result.
  return [tuple(items[i:i + size]) for i in range(len(items) - size + 1)]
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	#Link: http://www.cl.ecei.tohoku.ac.jp/nlp100/

	"""
	Question 05:
	05. n-gram
	与えられたシーケンス（文字列やリストなど）からn-gramを作る関数を作成せよ．
	この関数を用い，"I am an NLPer"という文から単語bi-gram，文字bi-gramを得よ．
	"""

	import re

	#文のパースのルールを書く。, . !を1文字と区別する
	def parseSentence(sentence):
	  """Split ``sentence`` into a list of tokens.

	  A token is either a run of word characters, commas and apostrophes, or a
	  single ``,``, ``.``, ``!`` or ``:``.  Anything else (whitespace, for
	  example) is skipped.

	  NOTE: the comma is part of the first character class, so a comma that
	  directly follows a word stays inside that word's token (``"Hello,"``).

	  Args:
	    sentence: The text to tokenize.

	  Returns:
	    The tokens in the order they appear in ``sentence``.
	  """
	  # The alternatives must be separated by unescaped "|"; an escaped "\|"
	  # would match a literal pipe character instead of acting as alternation.
	  return re.findall(r"[\w,']+|,|\.|!|:", sentence)

	"""
	optionがletterの場合は文字列レベルのn-gramを行い、それ以外のoptionでは単語レベルのn-gramを行う
	例:
	In [165]: nGram("I am an NLPer",2,option="sentence")
	Out[165]: [('I', 'am'), ('am', 'an'), ('an', 'NLPer')]
	In [166]: nGram("abcdefg",2)
	Out[166]: [('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e'), ('e', 'f'), ('f', 'g')]
	In [167]: nGram("abcdefg",100)
	Out[167]: []
	In [168]: nGram("I am an NLPer",200,option="sentence")
	Out[168]: []
	"""

	def nGram(target, n, option="letter"):
	  """Build the list of n-grams of ``target``.

	  With ``option == "letter"`` the window slides directly over ``target``
	  (the characters of a string, or the items of any sliceable sequence).
	  With any other ``option`` value, ``target`` is first tokenized with
	  ``parseSentence`` and the window slides over the resulting tokens.

	  Args:
	    target: Sequence to slice (letter mode) or sentence to tokenize.
	    n: Window size; any value accepted by ``int()``.
	    option: ``"letter"`` for element-level n-grams, anything else for
	      word-level n-grams.

	  Returns:
	    A list of tuples, each holding ``n`` consecutive items, in order of
	    appearance.  The list is empty when ``n`` is smaller than 1 or larger
	    than the number of available items.

	  Raises:
	    ValueError, TypeError: If ``n`` cannot be converted by ``int()``.

	  Example:
	    >>> nGram("abcd", 2)
	    [('a', 'b'), ('b', 'c'), ('c', 'd')]
	  """
	  # Convert once so the window count and the slice bounds always agree.
	  size = int(n)
	  items = target if option == "letter" else parseSentence(target)
	  if size < 1:
	    # A window of zero or negative width has no meaningful n-grams.
	    return []
	  # range() works on Python 2 and 3 (xrange is Python 2 only); a negative
	  # count simply yields an empty range, hence an empty result.
	  return [tuple(items[i:i + size]) for i in range(len(items) - size + 1)]