Skip to content

Instantly share code, notes, and snippets.

@macks22
Created June 4, 2017 12:38
Show Gist options
  • Save macks22/3754f132efc6bf6e54311c551ce675a8 to your computer and use it in GitHub Desktop.
Save macks22/3754f132efc6bf6e54311c551ce675a8 to your computer and use it in GitHub Desktop.
Provides a gensim-compatible corpus for the 20 Newsgroups dataset, [based on this PR](https://github.com/RaRe-Technologies/gensim/pull/1388).
import os
import re
import sys
from gensim.corpora import textcorpus
from gensim import utils
class NewsgroupCorpus(textcorpus.TextDirectoryCorpus):
    """Directory corpus that reduces each newsgroup post to its message body.

    Every document is passed through `extract_body` before the parent
    class's own `preprocess_text` runs on it.
    """

    def extract_body(self, text):
        """Return `text` with quoting, PGP signature and headers removed.

        The three module-level strippers are applied in a fixed order:
        quoted lines first, then the footer, and finally the header.
        """
        unquoted = strip_newsgroup_quoting(text)
        unsigned = strip_newsgroup_footer(unquoted)
        return strip_newsgroup_header(unsigned)

    def preprocess_text(self, text):
        """Extract the message body, then defer to the parent's preprocessing."""
        message_body = self.extract_body(text)
        parent = super(NewsgroupCorpus, self)
        return parent.preprocess_text(message_body)
def strip_newsgroup_header(text):
    """
    Drop the header section of a "news"-formatted message.

    Everything up to and including the first blank line (two consecutive
    newlines) is discarded and the remainder is returned. If the text
    contains no blank line at all, an empty string is returned.
    """
    separator = '\n\n'
    cut = text.find(separator)
    if cut < 0:
        # No header/body separator: nothing counts as body.
        return text[:0]
    return text[cut + len(separator):]
# Matches a line that is either quoted text (starts with '>' or '|') or a
# typical attribution line introducing a quote ('... writes:', 'In article').
_QUOTE_RE = re.compile(r'(writes in|writes:|wrote:|says:|said:'
                       r'|^In article|^Quoted from|^\||^>)')


def strip_newsgroup_quoting(text):
    """
    Remove quoted material from a "news"-formatted message.

    A line is dropped when it starts with '>' or '|', starts with
    'In article' or 'Quoted from', or contains one of the attribution
    phrases 'writes in', 'writes:', 'wrote:', 'says:' or 'said:'.
    All other lines are kept, in order, and re-joined with newlines.
    """
    kept = []
    for row in text.split('\n'):
        if _QUOTE_RE.search(row) is None:
            kept.append(row)
    return '\n'.join(kept)
# Marker line that opens an ASCII-armored PGP signature block.
_PGP_SIG_BEGIN = "-----BEGIN PGP SIGNATURE-----"


def strip_newsgroup_footer(text):
    """
    Cut a trailing PGP signature off a "news"-formatted message.

    Returns everything before the first PGP signature marker. When the
    marker does not occur, the text is returned unchanged. No other kind
    of signature block is detected.
    """
    before_signature, _marker, _rest = text.partition(_PGP_SIG_BEGIN)
    return before_signature
if __name__ == "__main__":
    # Demo entry point: build the corpus over a 20 Newsgroups directory and
    # run body extraction on one sample post.
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit("usage: %s DATA_PATH" % sys.argv[0])
    data_path = sys.argv[1]

    corpus = NewsgroupCorpus(
        data_path, exclude_pattern="README.*",
        encoding_errors='ignore', encoding='ascii')

    fpath = os.path.join(data_path, 'alt.atheism', '51060')
    # Close the handle deterministically, and decode with the same settings
    # the corpus uses: the strippers work on text (str patterns/separators),
    # so raw bytes from a binary-mode read would raise TypeError on Python 3.
    with utils.smart_open(fpath) as fin:
        text = utils.to_unicode(fin.read(), 'ascii', errors='ignore')
    body = corpus.extract_body(text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment