Skip to content

Instantly share code, notes, and snippets.

@rtanglao
Created August 10, 2011 18:41
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rtanglao/1137754 to your computer and use it in GitHub Desktop.
Save rtanglao/1137754 to your computer and use it in GitHub Desktop.
WIP-PythonCodeToGetBigGramsForGetSatisfactionDuplicationDetection
>>> bigrams(tokens)
[('Deleting', 'from'), ('from', 'All'), ('All', 'Mail'), ('Mail', 'I'), ('I', 'have'),
('have', 'over'), ('over', '600'), ('600', 'messages'), ('messages', 'in'), ('in', 'the'),
('the', 'ALL'), ('ALL', 'Mail'), ('Mail', 'box'), ('box', 'that'), ('that', 'I'),
('I', 'want'), ('want', 'to'), ('to', 'delete'), ('delete', 'but'), ('but', 'when'),
('when', 'I'), ('I', 'do'), ('do', 'they'), ('they', 'just'), ('just', 'keep'),
('keep', 'coming'), ('coming', 'back.'), ('back.', 'I'), ('I', 'have'),
('have', 'tried'), ('tried', 'compacting'), ('compacting', 'the'), ('the', 'folder'),
('folder', 'but'), ('but', 'that'), ('that', 'does'), ('does', 'not'),
('not', 'help.'), ('help.', 'Please'), ('Please', 'advise'), ('advise', 'how'),
('how', 'I'), ('I', 'can'), ('can', 'delete'), ('delete', 'messages'),
('messages', 'for'), ('for', 'ever!'), ('ever!', 'Thank'), ('Thank', 'you')]
>>> a # from http://getsatisfaction.com/mozilla_messaging/topics/deleting_from_all_mail
'Deleting from All Mail I have over 600 messages in the ALL Mail box that I want to
delete but when I do they just keep coming back. I have tried compacting the folder
but that does not help. Please advise how I can delete messages for ever! Thank you'
>>> b # from http://getsatisfaction.com/mozilla_messaging/topics/deleting_from_all_mail-zlu37
'This is the SECOND time I have asked the same question over the last couple of weeks
and not had any reply!! I cannot delete messages from ALL Mail; they just keep coming back
into the folder.I have tried compacting and it makes no difference. Sometimes the messages
do delete for a couple days but always come back.If I do not get a reply soon then I
will unistall and get rid of Thunderbird Thanx '
>>> bigram_b = bigrams(b.split())
>>> bigram_b
[('This', 'is'), ('is', 'the'), ('the', 'SECOND'), ('SECOND', 'time'), ('time', 'I'),
('I', 'have'), ('have', 'asked'), ('asked', 'the'), ('the', 'same'),
('same', 'question'), ('question', 'over'), ('over', 'the'), ('the', 'last'),
('last', 'couple'), ('couple', 'of'), ('of', 'weeks'), ('weeks', 'and'),
('and', 'not'), ('not', 'had'), ('had', 'any'), ('any', 'reply!!'),
('reply!!', 'I'), ('I', 'cannot'), ('cannot', 'delete'), ('delete', 'messages'),
('messages', 'from'), ('from', 'ALL'), ('ALL', 'Mail;'), ('Mail;', 'they'),
('they', 'just'), ('just', 'keep'), ('keep', 'coming'), ('coming', 'back'),
('back', 'into'), ('into', 'the'), ('the', 'folder.I'), ('folder.I', 'have'),
('have', 'tried'), ('tried', 'compacting'), ('compacting', 'and'), ('and', 'it'),
('it', 'makes'), ('makes', 'no'), ('no', 'difference.'),
('difference.', 'Sometimes'), ('Sometimes', 'the'), ('the', 'messages'),
('messages', 'do'), ('do', 'delete'), ('delete', 'for'), ('for', 'a'),
('a', 'couple'), ('couple', 'days'), ('days', 'but'), ('but', 'always'),
('always', 'come'), ('come', 'back.If'), ('back.If', 'I'), ('I', 'do'),
('do', 'not'), ('not', 'get'), ('get', 'a'), ('a', 'reply'), ('reply', 'soon'),
('soon', 'then'), ('then', 'I'), ('I', 'will'), ('will', 'unistall'),
('unistall', 'and'), ('and', 'get'), ('get', 'rid'), ('rid', 'of'), ('of', 'Thunderbird'), ('Thunderbird', 'Thanx')]
>>>
# Work-in-progress outline for detecting duplicate Get Satisfaction (GS) topics
# by comparing word bigrams. Only the HTML-stripping and bigram steps have
# example code so far; the remaining steps are notes for code still to be written.
#
# TODO: get GS topics from MongoDB
# TODO: split into words
# Remove HTML tags.
# Code from http://stackoverflow.com/questions/37486/filter-out-html-tags-and-resolve-entities-in-python
import lxml.html
t = lxml.html.fromstring("...")  # "..." is a placeholder for a topic's HTML
t.text_content()  # return value is discarded here; NOTE(review): presumably this text should feed the split step below - confirm
# Calculate bigrams
# (see jaggu's blog post:
# http://jaganadhg.freeflux.net/blog/?q=bigram)
# Sample string for generating bigrams.
a = "Jaganadh is testing this application"
tokens = a.split()  # splits on whitespace only, so punctuation stays attached to words
from nltk import bigrams
bigrams(tokens)  # return value is discarded here; NOTE(review): newer NLTK releases may return a generator rather than a list - confirm before indexing or len()
# TODO: loop through the bigrams and, if > 75% of the bigrams match, then it's a duplicate
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment