-
-
Save rtanglao/1137754 to your computer and use it in GitHub Desktop.
WIP-PythonCodeToGetBigGramsForGetSatisfactionDuplicationDetection
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> bigrams(tokens) | |
[('Deleting', 'from'), ('from', 'All'), ('All', 'Mail'), ('Mail', 'I'), ('I', 'have'), | |
('have', 'over'), ('over', '600'), ('600', 'messages'), ('messages', 'in'), ('in', 'the'), | |
('the', 'ALL'), ('ALL', 'Mail'), ('Mail', 'box'), ('box', 'that'), ('that', 'I'), | |
('I', 'want'), ('want', 'to'), ('to', 'delete'), ('delete', 'but'), ('but', 'when'), | |
('when', 'I'), ('I', 'do'), ('do', 'they'), ('they', 'just'), ('just', 'keep'), | |
('keep', 'coming'), ('coming', 'back.'), ('back.', 'I'), ('I', 'have'), | |
('have', 'tried'), ('tried', 'compacting'), ('compacting', 'the'), ('the', 'folder'), | |
('folder', 'but'), ('but', 'that'), ('that', 'does'), ('does', 'not'), | |
('not', 'help.'), ('help.', 'Please'), ('Please', 'advise'), ('advise', 'how'), | |
('how', 'I'), ('I', 'can'), ('can', 'delete'), ('delete', 'messages'), | |
('messages', 'for'), ('for', 'ever!'), ('ever!', 'Thank'), ('Thank', 'you')] | |
>>> a # from http://getsatisfaction.com/mozilla_messaging/topics/deleting_from_all_mail | |
'Deleting from All Mail I have over 600 messages in the ALL Mail box that I want to | |
delete but when I do they just keep coming back. I have tried compacting the folder | |
but that does not help. Please advise how I can delete messages for ever! Thank you' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
>>> b # from http://getsatisfaction.com/mozilla_messaging/topics/deleting_from_all_mail-zlu37 | |
'This is the SECOND time I have asked the same question over the last couple of weeks | |
and not had any reply!! I cannot delete messages from ALL Mail; they just keep coming back | |
into the folder.I have tried compacting and it makes no difference. Sometimes the messages | |
do delete for a couple days but always come back.If I do not get a reply soon then I | |
will unistall and get rid of Thunderbird Thanx ' | |
>>> bigram_b = bigrams(b.split()) | |
>>> bigram_b | |
[('This', 'is'), ('is', 'the'), ('the', 'SECOND'), ('SECOND', 'time'), ('time', 'I'), | |
('I', 'have'), ('have', 'asked'), ('asked', 'the'), ('the', 'same'), | |
('same', 'question'), ('question', 'over'), ('over', 'the'), ('the', 'last'), | |
('last', 'couple'), ('couple', 'of'), ('of', 'weeks'), ('weeks', 'and'), | |
('and', 'not'), ('not', 'had'), ('had', 'any'), ('any', 'reply!!'), | |
('reply!!', 'I'), ('I', 'cannot'), ('cannot', 'delete'), ('delete', 'messages'), | |
('messages', 'from'), ('from', 'ALL'), ('ALL', 'Mail;'), ('Mail;', 'they'), | |
('they', 'just'), ('just', 'keep'), ('keep', 'coming'), ('coming', 'back'), | |
('back', 'into'), ('into', 'the'), ('the', 'folder.I'), ('folder.I', 'have'), | |
('have', 'tried'), ('tried', 'compacting'), ('compacting', 'and'), ('and', 'it'), | |
('it', 'makes'), ('makes', 'no'), ('no', 'difference.'), | |
('difference.', 'Sometimes'), ('Sometimes', 'the'), ('the', 'messages'), | |
('messages', 'do'), ('do', 'delete'), ('delete', 'for'), ('for', 'a'), | |
('a', 'couple'), ('couple', 'days'), ('days', 'but'), ('but', 'always'), | |
('always', 'come'), ('come', 'back.If'), ('back.If', 'I'), ('I', 'do'), | |
('do', 'not'), ('not', 'get'), ('get', 'a'), ('a', 'reply'), ('reply', 'soon'), | |
('soon', 'then'), ('then', 'I'), ('I', 'will'), ('will', 'unistall'), | |
('unistall', 'and'), ('and', 'get'), ('get', 'rid'), ('rid', 'of'), ('of', 'Thunderbird'), ('Thunderbird', 'Thanx')] | |
>>> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Work-in-progress outline for spotting duplicate Get Satisfaction (GS)
# topics by comparing their word bigrams. Only the HTML-stripping and
# bigram steps have code so far; the remaining steps are notes.

import lxml.html
from nltk import bigrams

# get GS topics from MongoDB (not implemented yet)
# split into words
# remove html tags
# code from http://stackoverflow.com/questions/37486/filter-out-html-tags-and-resolve-entities-in-python
t = lxml.html.fromstring("...")  # "..." is a placeholder; presumably a topic's HTML goes here
# Keep the tag-free text instead of discarding the call's result.
plain_text = t.text_content()

# calculate bigrams
# (see jaggu's blog post: http://jaganadhg.freeflux.net/blog/?q=bigram)
# Creating a string for generating bi-grams
a = "Jaganadh is testing this application"
tokens = a.split()
# In NLTK 3+ bigrams() returns an iterator, so materialize it to get the
# list of (word, next_word) tuples; list() is harmless on older versions
# that already returned a list.
token_bigrams = list(bigrams(tokens))

# TODO: loop through the bigrams and if > 75% of the bigrams match, then it's a duplicate
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment