import re
from collections import defaultdict
from functools import reduce

import nltk
from nltk.corpus import mac_morpho, stopwords
from nltk.probability import FreqDist, ConditionalFreqDist as CFD
from nltk.metrics import f_measure, BigramAssocMeasures, TrigramAssocMeasures
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.tokenize import WordPunctTokenizer, RegexpTokenizer
from nltk.util import ngrams


class Classificador(object):
    def __init__(self):
        # Train a cascade of Portuguese taggers on the mac_morpho corpus:
        # bigram -> unigram -> default tag 'N' as successive backoffs.
        tsents = mac_morpho.tagged_sents()
        tsents = [[(w.lower(), t) for (w, t) in sent] for sent in tsents if sent]
        tagger0 = nltk.DefaultTagger('N')
        tagger1 = nltk.UnigramTagger(tsents[100:], backoff=tagger0)
        self.tagger = nltk.BigramTagger(tsents[100:], backoff=tagger1)

    # Tag the words of the text
    def classificar(self, frases):
        self.texto_classificado = self.tagger.tag(frases)

    # Return the list of words carrying a given tag
    def filtrar(self, tag):
        filtrado = []
        for palavra in self.texto_classificado:
            if palavra[1] == tag:
                filtrado.append(palavra[0])
        return filtrado
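

# Usage sketch: tag a tokenized Portuguese sentence and pull out the nouns.
# Assumes the mac_morpho corpus has been downloaded
# (nltk.download('mac_morpho')); the helper name and the sample sentence
# are illustrative only.
def exemplo_classificador():
    clf = Classificador()             # trains the tagger cascade (can be slow)
    clf.classificar(['o', 'gato', 'preto', 'dorme'])
    return clf.filtrar('N')           # tokens tagged 'N' (noun) in mac_morpho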


class ConcordanceIndex(object):
    """
    An index that can be used to look up the offset locations at which
    a given word occurs in a document.
    """
    def __init__(self, tokens, key=lambda x: x):
        """
        Construct a new concordance index.

        @param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        @param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use C{key=lambda s:s.lower()}, then the index will be
            case-insensitive.
        """
        self._tokens = tokens
        """The document (list of tokens) that this concordance index
        was created from."""

        self._key = key
        """Function mapping each token to an index key (or None)."""

        self._offsets = defaultdict(list)
        """Dictionary mapping words (or keys) to lists of offset indices."""

        # Initialize the index (self._offsets)
        for index, word in enumerate(tokens):
            word = self._key(word)
            self._offsets[word].append(index)

    def tokens(self):
        """
        @rtype: C{list} of token
        @return: The document that this concordance index was
            created from.
        """
        return self._tokens

    def offsets(self, word):
        """
        @rtype: C{list} of C{int}
        @return: A list of the offset positions at which the given
            word occurs.  If a key function was specified for the
            index, then the given word's key will be looked up.
        """
        word = self._key(word)
        return self._offsets[word]

    def __repr__(self):
        return '<ConcordanceIndex for %d tokens (%d types)>' % (
            len(self._tokens), len(self._offsets))

    def generate_concordance(self, word, width=75, lines=25):
        """
        Build a concordance for C{word} with the specified context window
        and store the resulting lines in C{self.contexto}.

        @param word: The target word
        @type word: C{str}
        @param width: The width of each line, in characters (default=75)
        @type width: C{int}
        @param lines: The maximum number of lines to generate (default=25)
        @type lines: C{int}
        """
        half_width = (width - len(word) - 2) // 2
        context = width // 4  # approx number of words of context
        offsets = self.offsets(word)
        if offsets:
            lines = min(lines, len(offsets))
            # print("Displaying %s of %s matches:" % (lines, len(offsets)))
            contexto = []
            for i in offsets:
                if lines <= 0:
                    break
                left = (' ' * half_width +
                        ' '.join(self._tokens[i - context:i]))
                right = ' '.join(self._tokens[i + 1:i + context])
                left = left[-half_width:]
                right = right[:half_width]
                contexto.append(left + ' ' + self._tokens[i] + ' ' + right)
                lines -= 1
            self.contexto = contexto
        else:
            # no occurrences: leave an empty result instead of an unset attribute
            self.contexto = []
            print("No matches")


class ContextIndex(object):
    """
    A bidirectional index between words and their 'contexts' in a text.
    The context of a word is usually defined to be the words that occur
    in a fixed window around the word; but other definitions may also
    be used by providing a custom context function.
    """
    @staticmethod
    def _default_context(tokens, i):
        """One left token and one right token, normalized to lowercase."""
        left = '*START*' if i == 0 else tokens[i - 1].lower()
        right = '*END*' if i == len(tokens) - 1 else tokens[i + 1].lower()
        return (left, right)

    def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
        self._key = key
        self._tokens = tokens
        self._context_func = context_func if context_func else self._default_context
        if filter:
            tokens = [t for t in tokens if filter(t)]
        self._word_to_contexts = CFD((self._key(w), self._context_func(tokens, i))
                                     for i, w in enumerate(tokens))
        self._context_to_words = CFD((self._context_func(tokens, i), self._key(w))
                                     for i, w in enumerate(tokens))

    def tokens(self):
        """
        @rtype: C{list} of token
        @return: The document that this context index was created from.
        """
        return self._tokens

    def word_similarity_dict(self, word):
        """
        Return a dictionary mapping from words to 'similarity scores',
        indicating how often these two words occur in the same context.
        """
        word = self._key(word)
        word_contexts = set(self._word_to_contexts[word])
        scores = {}
        for w, w_contexts in self._word_to_contexts.items():
            scores[w] = f_measure(word_contexts, set(w_contexts))
        return scores

    def similar_words(self, word, n=20):
        scores = defaultdict(int)
        for c in self._word_to_contexts[self._key(word)]:
            for w in self._context_to_words[c]:
                if w != word:
                    # print(w, c, self._context_to_words[c][word],
                    #       self._context_to_words[c][w])
                    scores[w] += (self._context_to_words[c][word] *
                                  self._context_to_words[c][w])
        # highest-scoring (most similar) words first
        return sorted(scores, key=scores.get, reverse=True)[:n]

    def common_contexts(self, words, fail_on_unknown=False):
        """
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        @param words: The words used to seed the similarity search
        @type words: C{list} of C{str}
        @param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        """
        words = [self._key(w) for w in words]
        contexts = [set(self._word_to_contexts[w]) for w in words]
        empty = [words[i] for i in range(len(words)) if not contexts[i]]
        common = reduce(set.intersection, contexts)
        if empty and fail_on_unknown:
            raise ValueError("The following word(s) were not found:",
                             " ".join(words))
        elif not common:
            # nothing in common -- just return an empty freqdist.
            return FreqDist()
        else:
            fd = FreqDist(c for w in words
                          for c in self._word_to_contexts[w]
                          if c in common)
            return fd
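

# Usage sketch: a ContextIndex over a toy token list; similar_words scores
# words by how often they share (left, right) contexts with the query, and
# common_contexts returns the contexts two words share. The helper name and
# the sample text are illustrative only.
def exemplo_contextos():
    tokens = "o gato dorme e o cachorro dorme e o menino corre".split()
    indice = ContextIndex(tokens, key=lambda s: s.lower())
    parecidas = indice.similar_words('gato')            # e.g. ['cachorro']
    comuns = indice.common_contexts(['gato', 'cachorro'])
    return parecidas, comuns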


class Ngrama(object):
    def __init__(self, palavras):
        self.palavras = palavras

    # Find pairs of words that co-occur and keep the best as a list of tuples
    def set_bigramas(self, freq=3, best=50):
        bcf = BigramCollocationFinder.from_words(self.palavras)
        stopset = set(stopwords.words('portuguese'))
        filter_stops = lambda w: len(w) < 3 or w in stopset
        bcf.apply_word_filter(filter_stops)
        bcf.apply_freq_filter(freq)
        self.bigramas = bcf.nbest(BigramAssocMeasures.pmi, best)

    def set_trigramas(self, freq=2, best=20):
        tcf = TrigramCollocationFinder.from_words(self.palavras)
        stopset = set(stopwords.words('portuguese'))
        filter_stops = lambda w: len(w) < 3 or w in stopset
        tcf.apply_word_filter(filter_stops)
        tcf.apply_freq_filter(freq)
        self.trigramas = tcf.nbest(TrigramAssocMeasures.pmi, best)

    def set_ngramas(self, n):
        # nltk.util.ngrams replaces the older ingrams helper
        return ngrams(self.palavras, n)

    def fd(self):
        return FreqDist(self.palavras)
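

# Usage sketch: collocation extraction with Ngrama. apply_freq_filter(freq)
# drops n-grams seen fewer than freq times and nbest ranks the rest by PMI,
# so a reasonably large word list is needed for sensible output. Assumes the
# Portuguese stopword list is available (nltk.download('stopwords')); the
# helper name and the thresholds are illustrative only.
def exemplo_ngramas(palavras):
    ng = Ngrama(palavras)
    ng.set_bigramas(freq=2, best=10)
    ng.set_trigramas(freq=2, best=10)
    return ng.bigramas, ng.trigramas, ng.fd().most_common(10)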


class Texto(object):
    # Splits the raw text into tokens, words, and sentences on demand
    def __init__(self, texto):
        self.texto = texto
        self.tokens = False
        self.palavras = False
        self.frases = False

    def set_tokens(self):
        self.tokens = WordPunctTokenizer().tokenize(self.texto)

    def set_palavras(self):
        tokenizer = RegexpTokenizer(r'\w+')
        self.palavras = tokenizer.tokenize(self.texto.lower())

    def set_frases(self):
        sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
        self.frases = sent_tokenizer.tokenize(self.texto)

    def similar(self, word, num=20):
        if not self.tokens:
            self.set_tokens()
        if '_word_context_index' not in self.__dict__:
            self._word_context_index = ContextIndex(self.tokens,
                                                    filter=lambda x: x.isalpha(),
                                                    key=lambda s: s.lower())
        # words = self._word_context_index.similar_words(word, num)
        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions()
                          for c in wci[w]
                          if c in contexts and not w == word)
            # most frequent sharers of the word's contexts
            return [w for w, _ in fd.most_common(num)]
        else:
            return False

    def concordance(self, word, width=79, lines=25):
        if not self.tokens:
            self.set_tokens()
        if '_concordance_index' not in self.__dict__:
            self._concordance_index = ConcordanceIndex(self.tokens,
                                                       key=lambda s: s.lower())
        self._concordance_index.generate_concordance(word, width, lines)
        return self._concordance_index.contexto

    # NOT WORKING
    def common_contexts(self, words, num=20):
        if not self.palavras:
            self.set_palavras()
        if '_word_context_index' not in self.__dict__:
            self._word_context_index = ContextIndex(self.palavras,
                                                    key=lambda s: s.lower())
        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                return "No common contexts were found"
            else:
                # most frequent shared contexts, as (left, right) pairs
                return [c for c, _ in fd.most_common(num)]
        except ValueError as e:
            print(e)

    _CONTEXT_RE = re.compile(r'\w+|[\.\!\?]')

    def _context(self, tokens, i):
        # Left context
        j = i - 1
        while j >= 0 and not self._CONTEXT_RE.match(tokens[j]):
            j = j - 1
        left = '*START*' if j == 0 else tokens[j]
        # Right context
        j = i + 1
        while j < len(tokens) and not self._CONTEXT_RE.match(tokens[j]):
            j = j + 1
        right = '*END*' if j == len(tokens) else tokens[j]
        return (left, right)
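

# Usage sketch: the typical flow through Texto -- raw string in, then
# sentence split, concordance lines and distributionally similar words out.
# Assumes the punkt Portuguese model is available (nltk.download('punkt'));
# the helper name, the input parameter and the query word 'gato' are
# illustrative only.
def exemplo_texto(texto_bruto):
    t = Texto(texto_bruto)
    t.set_frases()                    # sentence segmentation (punkt, Portuguese)
    linhas = t.concordance('gato')    # formatted concordance lines ([] if absent)
    parecidas = t.similar('gato')     # similar words, or False if 'gato' is absent
    return t.frases, linhas, parecidas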


class TokenSearcher(object):
    """
    A class that makes it easier to use regular expressions to search
    over tokenized strings.  The tokenized string is converted to a
    string where tokens are marked with angle brackets -- e.g.,
    C{'<the><window><is><still><open>'}.  The regular expression
    passed to the L{findall()} method is modified to treat angle
    brackets as non-grouping parentheses, in addition to matching the
    token boundaries; and to have C{'.'} not match the angle brackets.
    """
    def __init__(self, tokens):
        self._raw = ''.join('<' + w + '>' for w in tokens)

    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> ts.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> ts.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        @param regexp: A regular expression
        @type regexp: C{str}
        """
        # preprocess the regular expression
        regexp = re.sub(r'\s', '', regexp)
        regexp = re.sub(r'<', '(?:<(?:', regexp)
        regexp = re.sub(r'>', ')>)', regexp)
        regexp = re.sub(r'(?<!\\)\.', '[^>]', regexp)
        # perform the search
        hits = re.findall(regexp, self._raw)
        # sanity check: every hit must be a full <token>...<token> span
        for h in hits:
            if not (h.startswith('<') and h.endswith('>')):
                raise ValueError('Bad regexp for TokenSearcher.findall')
        # postprocess the output: strip the outer brackets and split into tokens
        hits = [h[1:-1].split('><') for h in hits]
        return hits
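

# Usage sketch: TokenSearcher joins the tokens as '<uma><pedra><no>...' and
# rewrites the pattern so each <...> matches exactly one token; findall then
# returns the matched token sequences. The helper name and the sample
# sentence are illustrative only.
def exemplo_busca():
    ts = TokenSearcher("uma pedra no meio do caminho".split())
    return ts.findall("<no><.*><do>")  # -> [['no', 'meio', 'do']]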