Skip to content

Instantly share code, notes, and snippets.

@rafaismyname
Created December 22, 2016 19:36
Show Gist options
  • Save rafaismyname/1579f247fe113a7d24e6dd034aa7b580 to your computer and use it in GitHub Desktop.
Save rafaismyname/1579f247fe113a7d24e6dd034aa7b580 to your computer and use it in GitHub Desktop.
Find most common sentences
# -*- coding: utf-8 -*-
import sys
from csv import reader as csv_reader
from string import punctuation
from nltk import sent_tokenize, word_tokenize
from nltk.util import ngrams
from collections import Counter
# --- configuration ------------------------------------------------------
lang = "portuguese"  # NLTK language model used by sent_tokenize/word_tokenize
encode = "utf8"  # default string encoding forced below
csv_file_path = "tickets.csv"  # input CSV read by the main loop
csv_delimiter = ","
csv_phrase_index = 0  # column index holding the free-text phrase
csv_skip_header = True  # skip the first CSV row when True
ngram_base_length = 3 # numbers of words that compose a sentence
reduce_ngram = False  # also count (base - 1)-word phrases
increase_ngram = False  # also count (base + 1)-word phrases
acceptable_frequency = 5  # only report phrases seen at least this many times
# consts
phrase_counter = Counter()  # phrase text -> occurrence count (filled by extract_phrases)
# set default encoding
# NOTE(review): Python 2-only hack — reload(sys) re-exposes setdefaultencoding,
# which was removed in Python 3. The whole script targets Python 2.
reload(sys)
sys.setdefaultencoding(encode)
def untokenize(the_ngram):
    """Rebuild a readable phrase from an n-gram of tokens.

    Word tokens are joined with single spaces, while punctuation tokens
    are glued directly onto the preceding word, e.g.
    ("hi", ",", "you") -> "hi, you".
    """
    pieces = []
    for token in the_ngram:
        if token in punctuation:
            pieces.append(token)
        else:
            pieces.append(" " + token)
    return "".join(pieces).strip()
def extract_phrases(text, length):
    """Tally every `length`-word phrase found in `text`.

    The text is split into sentences first so no phrase ever spans a
    sentence boundary; any n-gram that contains a punctuation token is
    skipped. Counts accumulate in the module-level `phrase_counter`.
    """
    for sentence in sent_tokenize(text, lang):
        tokens = word_tokenize(sentence, lang)
        for gram in ngrams(tokens, length):
            contains_punct = any(token in punctuation for token in gram)
            if not contains_punct:
                phrase_counter[untokenize(gram)] += 1
def add_sentence(sentence):
    """Count the n-grams of `sentence` at each configured phrase length.

    The sentence is lowercased once so every pass (base length and the
    optional reduced/increased lengths) counts with identical casing.
    The original code lowercased only the base-length pass, so the
    shorter/longer passes could count differently-cased duplicates
    unless the caller pre-lowered the text.
    """
    text = sentence.lower()  # normalize once, share across all passes
    extract_phrases(text, ngram_base_length)
    if reduce_ngram:
        extract_phrases(text, ngram_base_length - 1)
    if increase_ngram:
        extract_phrases(text, ngram_base_length + 1)
with open(csv_file_path, "r") as csv_buffer:
reader = csv_reader(csv_buffer, delimiter=csv_delimiter)
if csv_skip_header:
reader.next()
[add_sentence(line[csv_phrase_index].lower()) for line in reader]
for k,v in phrase_counter.most_common():
if v >= acceptable_frequency:
print '{0: <5}'.format(v), k
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment