Created
June 4, 2017 12:38
-
-
Save macks22/3754f132efc6bf6e54311c551ce675a8 to your computer and use it in GitHub Desktop.
Provides a gensim-compatible corpus for the 20 Newsgroups dataset, [based on this PR](https://github.com/RaRe-Technologies/gensim/pull/1388).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import sys | |
from gensim.corpora import textcorpus | |
from gensim import utils | |
class NewsgroupCorpus(textcorpus.TextDirectoryCorpus):
    """Directory corpus that reduces each newsgroup post to its body.

    Every document is passed through the module's quote, footer and header
    strippers before the base class preprocessing runs.
    """

    def extract_body(self, text):
        """Return `text` with quoted lines, PGP signature and headers removed.

        The strippers run in this order: quoting first, then the footer,
        then the header.
        """
        unquoted = strip_newsgroup_quoting(text)
        unsigned = strip_newsgroup_footer(unquoted)
        return strip_newsgroup_header(unsigned)

    def preprocess_text(self, text):
        """Extract the post body, then defer to the base class preprocessing."""
        return super(NewsgroupCorpus, self).preprocess_text(
            self.extract_body(text))
def strip_newsgroup_header(text):
    """
    Drop the header section of a "news"-formatted message.

    Everything up to and including the first blank line (the first
    occurrence of two consecutive newlines) is discarded. If the text
    contains no blank line at all, the result is empty.
    """
    # partition() yields (before, separator, after); only the tail is wanted.
    return text.partition('\n\n')[-1]
# Matches lines that are quoted (leading '>' or '|') or that typically
# introduce a quotation ("X writes:", "In article ...", and similar).
_QUOTE_RE = re.compile(r'(writes in|writes:|wrote:|says:|said:'
                       r'|^In article|^Quoted from|^\||^>)')


def strip_newsgroup_quoting(text):
    """
    Remove quoted material from a "news"-formatted message.

    A line is dropped when it starts with one of the quote characters
    > or |, or when it looks like the lead-in to a quotation (for
    example, it contains 'writes:' or starts with 'In article').
    All other lines are kept in their original order.
    """
    kept = []
    for candidate in text.split('\n'):
        if _QUOTE_RE.search(candidate) is None:
            kept.append(candidate)
    return '\n'.join(kept)
# Marker line that opens a PGP signature block.
_PGP_SIG_BEGIN = "-----BEGIN PGP SIGNATURE-----"


def strip_newsgroup_footer(text):
    """Cut a "news"-formatted message at the start of its PGP signature.

    Everything from the first PGP signature marker onwards is removed.
    Text without such a marker is returned unchanged.
    """
    cut = text.find(_PGP_SIG_BEGIN)
    return text if cut == -1 else text[:cut]
if __name__ == "__main__":
    # Demo: build the corpus from the directory given on the command line and
    # run body extraction on one known post.
    if len(sys.argv) < 2:
        # Exit with a usage hint rather than an unexplained IndexError.
        sys.exit("usage: %s <path-to-20-newsgroups-directory>" % sys.argv[0])
    data_path = sys.argv[1]

    corpus = NewsgroupCorpus(
        data_path, exclude_pattern="README.*",
        encoding_errors='ignore', encoding='ascii')

    fpath = os.path.join(data_path, 'alt.atheism', '51060')
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with utils.smart_open(fpath) as fin:
        raw = fin.read()
    # The read may yield bytes; the strip_* helpers work on text, so decode
    # with the same encoding settings the corpus was configured with.
    text = utils.to_unicode(raw, 'ascii', errors='ignore')
    body = corpus.extract_body(text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment