# macks22/newsgroup_corpus.py

## newsgroup_corpus.py
import os
import re
import sys

from gensim.corpora import textcorpus
from gensim import utils


class NewsgroupCorpus(textcorpus.TextDirectoryCorpus):
    """Directory corpus that reduces each newsgroup post to its body
    before handing it to the parent class for preprocessing."""

    def extract_body(self, text):
        """
        Return `text` with quoted lines, any PGP signature block, and the
        header section removed (applied in that order).
        """
        without_quotes = strip_newsgroup_quoting(text)
        without_footer = strip_newsgroup_footer(without_quotes)
        return strip_newsgroup_header(without_footer)

    def preprocess_text(self, text):
        """Extract the post body, then defer to the parent's preprocessing."""
        parent = super(NewsgroupCorpus, self)
        return parent.preprocess_text(self.extract_body(text))


def strip_newsgroup_header(text):
    """
    Drop the header section of a "news"-format message.

    The header is everything up to and including the first blank line
    (two consecutive newlines). If the text has no blank line at all, the
    whole input counts as header and an empty string is returned.
    """
    separator = '\n\n'
    cut = text.find(separator)
    if cut < 0:
        return ''
    return text[cut + len(separator):]


# Alternatives that mark a line as quoted material. The first five match
# anywhere in a line; the rest only match at the start of the line.
_QUOTE_RE = re.compile('(' + '|'.join([
    r'writes in',
    r'writes:',
    r'wrote:',
    r'says:',
    r'said:',
    r'^In article',
    r'^Quoted from',
    r'^\|',
    r'^>',
]) + ')')


def strip_newsgroup_quoting(text):
    """
    Remove quoted material from a "news"-format message.

    A line is dropped when it starts with a quote character (> or |),
    starts with 'In article' or 'Quoted from', or contains an attribution
    phrase such as 'writes:' or 'wrote:' anywhere. Remaining lines are
    re-joined with newlines in their original order.
    """
    return '\n'.join(
        line
        for line in text.split('\n')
        if _QUOTE_RE.search(line) is None
    )


# Marker line that opens an ASCII-armoured PGP signature.
_PGP_SIG_BEGIN = "-----BEGIN PGP SIGNATURE-----"


def strip_newsgroup_footer(text):
    """
    Cut a "news"-format message at the start of its PGP signature.

    Everything from the first PGP signature marker onwards is discarded.
    Text without that marker is returned unchanged; no other kind of
    signature block is recognised.
    """
    kept, _marker, _signature = text.partition(_PGP_SIG_BEGIN)
    return kept


if __name__ == "__main__":
    # Smoke-test driver: build a corpus over the dataset directory given as
    # the first command-line argument and extract the body of one sample
    # post. `corpus`, `text` and `body` stay bound at module level so they
    # can be inspected afterwards (e.g. with `python -i`).
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("usage: {} DATA_PATH".format(sys.argv[0]))
    data_path = sys.argv[1]
    corpus = NewsgroupCorpus(
        data_path, exclude_pattern="README.*",
        encoding_errors='ignore', encoding='ascii')
    fpath = os.path.join(data_path, 'alt.atheism', '51060')
    # Use a context manager so the file handle is closed deterministically
    # rather than leaked until garbage collection.
    with utils.smart_open(fpath) as fin:
        # NOTE(review): smart_open opens in binary mode by default, so the
        # payload is bytes on Python 3; decode with the same settings the
        # corpus was given so the str-based strip helpers can process it.
        text = utils.to_unicode(fin.read(), 'ascii', errors='ignore')
    body = corpus.extract_body(text)
	import os
	import re
	import sys

	from gensim.corpora import textcorpus
	from gensim import utils


	class NewsgroupCorpus(textcorpus.TextDirectoryCorpus):

	def extract_body(self, text):
	return strip_newsgroup_header(
	strip_newsgroup_footer(
	strip_newsgroup_quoting(text)))

	def preprocess_text(self, text):
	body = self.extract_body(text)
	return super(NewsgroupCorpus, self).preprocess_text(body)


	def strip_newsgroup_header(text):
	"""
	Given text in "news" format, strip the headers, by removing everything
	before the first blank line.
	"""
	_before, _blankline, after = text.partition('\n\n')
	return after


	_QUOTE_RE = re.compile(r'(writes in\|writes:\|wrote:\|says:\|said:'
	r'\|^In article\|^Quoted from\|^\\|\|^>)')


	def strip_newsgroup_quoting(text):
	"""
	Given text in "news" format, strip lines beginning with the quote
	characters > or \|, plus lines that often introduce a quoted section
	(for example, because they contain the string 'writes:'.)
	"""
	good_lines = [line for line in text.split('\n')
	if not _QUOTE_RE.search(line)]
	return '\n'.join(good_lines)


	_PGP_SIG_BEGIN = "-----BEGIN PGP SIGNATURE-----"


	def strip_newsgroup_footer(text):
	"""Given text in "news" format, attempt to remove a signature block."""
	try:
	return text[:text.index(_PGP_SIG_BEGIN)]
	except ValueError:
	return text


	if __name__ == "__main__":
	data_path = sys.argv[1]
	corpus = NewsgroupCorpus(
	data_path, exclude_pattern="README.*",
	encoding_errors='ignore', encoding='ascii')
	fpath = os.path.join(data_path, 'alt.atheism', '51060')
	text = utils.smart_open(fpath).read()
	body = corpus.extract_body(text)