Skip to content

Instantly share code, notes, and snippets.

@yaronv
Created September 2, 2018 07:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yaronv/2caa3919f27d6567dc8fed483a171bd3 to your computer and use it in GitHub Desktop.
Save yaronv/2caa3919f27d6567dc8fed483a171bd3 to your computer and use it in GitHub Desktop.
import re

from gensim.models.doc2vec import TaggedDocument
from gensim.parsing.porter import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess
class MyCorpus:
    """Re-iterable stream of ``TaggedDocument`` objects built from raw text rows.

    Each pass over the instance re-reads ``train_data`` and yields one
    ``TaggedDocument`` per row, so the same corpus object can be iterated
    more than once.

    NOTE(review): ``train_data`` is presumably a pandas DataFrame with
    ``'text'`` and ``'documentId'`` columns -- confirm against the caller.
    Any mapping whose ``'text'`` and ``'documentId'`` entries are iterables
    of equal length works the same way.
    """

    # Compiled once at class creation instead of being re-resolved per row.
    _EMAIL_RE = re.compile(r'\S*@\S*\s?')
    _URL_RE = re.compile(r'http\S+')

    def __init__(self, train_data):
        """Store the training data; no processing happens until iteration."""
        self.train_data = train_data

    @classmethod
    def _strip_noise(cls, text):
        """Return *text* without e-mail addresses, web addresses and single quotes."""
        text = cls._EMAIL_RE.sub('', text)
        text = cls._URL_RE.sub('', text)
        return text.replace("'", "")

    def __iter__(self):
        """Yield a ``TaggedDocument`` for every row of ``train_data``.

        Per row: scrub e-mails/URLs/single quotes, drop stopwords, stem,
        then tokenize with ``simple_preprocess(deacc=True)``. The row's
        ``documentId`` value becomes the document's single tag.
        """
        stemmer = PorterStemmer()
        texts = self.train_data['text']
        doc_ids = self.train_data['documentId']
        # Pair the columns positionally: indexing with a 0..n-1 counter is a
        # label lookup on a pandas Series and breaks when the index is not a
        # clean range (e.g. after filtering or a train/test split).
        for text, doc_id in zip(texts, doc_ids):
            cleaned = self._strip_noise(text)
            cleaned = remove_stopwords(cleaned)
            cleaned = stemmer.stem_sentence(cleaned)
            tokens = simple_preprocess(cleaned, deacc=True)
            yield TaggedDocument(words=tokens, tags=[doc_id])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment