Last active
November 20, 2015 14:27
-
-
Save nutszebra/10195847eaa28a1d7869 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
#Link: http://www.cl.ecei.tohoku.ac.jp/nlp100/ | |
""" | |
Question 06: | |
06. 集合 | |
"paraparaparadise"と"paragraph"に含まれる文字bi-gramの集合を, | |
それぞれ, XとYとして求め,XとYの和集合,積集合,差集合を求めよ. | |
さらに,'se'というbi-gramがXおよびYに含まれるかどうかを調べよ. | |
""" | |
"""*********************************************************** | |
100_questions_NLP_005より同様のn-gramの関数を定義する | |
***********************************************************""" | |
import re | |
def parseSentence(sentence):
    """Split *sentence* into a list of tokens using one regex scan.

    A token is either a run of word characters, commas and apostrophes, or
    a single ``,``, ``.``, ``!`` or ``:``.  Because ``,`` belongs to the
    first character class, a comma that touches a word stays attached to
    that word.  Characters matching none of the alternatives (whitespace,
    for example) are skipped.

    Returns the matched tokens in their order of appearance.
    """
    token_pattern = re.compile(r"[\w,']+|,|\.|!|:")
    return token_pattern.findall(sentence)
def nGram(target, n, option="letter"):
    """Return every run of *n* consecutive items of *target* as tuples.

    Args:
        target: The text to split into n-grams.
        n: Size of each n-gram; any value accepted by ``int()``.
        option: ``"letter"`` (the default) slices *target* directly, giving
            character n-grams for a string.  Any other value first
            tokenizes *target* with ``parseSentence`` and builds n-grams
            of the resulting tokens.

    Returns:
        A list of n-length tuples in order of appearance.  The list is
        empty when *n* is smaller than 1 or larger than the number of
        available items.
    """
    # Convert once so the window count and the slice width always agree
    # (the size may arrive as a string such as "2").
    size = int(n)
    if size < 1:
        return []

    items = target if option == "letter" else parseSentence(target)

    # range() exists on both Python 2 and 3 (xrange is Python 2 only); a
    # non-positive window count simply produces an empty list.
    return [tuple(items[i:i + size]) for i in range(len(items) - size + 1)]
"""********************************************************* | |
link: https://gist.github.com/nutszebra/5e29c345b700498bcc5b | |
*********************************************************""" | |
# Character bi-gram sets of the two words given in the exercise.
X = set(nGram("paraparaparadise", 2))
Y = set(nGram("paragraph", 2))

# Union, intersection and both directions of the difference.
XPlusY = X | Y
XIntersectY = X & Y
XMinusY = X - Y
YMinusX = Y - X

# nGram yields tuples of characters, so the bi-gram 'se' is ("s", "e").
se = {("s", "e")}

# Sets have no defined order; sort before printing so the output is
# reproducible from run to run.
print(u"X集合: {0}".format(sorted(X)))
print(u"Y集合: {0}".format(sorted(Y)))
print(u"和集合: {0}".format(sorted(XPlusY)))
print(u"X-Y差集合: {0}".format(sorted(XMinusY)))
print(u"Y-X差集合: {0}".format(sorted(YMinusX)))
print(u"積集合: {0}".format(sorted(XIntersectY)))

# The exercise asks whether 'se' is contained in X and in Y, so report each
# set separately instead of testing their union (which cannot tell which of
# the two sets actually holds the bi-gram).
for label, bigrams in (("X", X), ("Y", Y)):
    if se.issubset(bigrams):
        print("seのバイグラムは{0}に含まれる".format(label))
    else:
        print("seのバイグラムは{0}に含まれない".format(label))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment