# TristynAlxander/spacy_paragraphs_pipeline.py

## spacy_paragraphs_pipeline.py
@spacy.language.Language.component("paragraph_parser")
def define_paragraphs(document):
  """
  DESCRIPTION:
    A SpaCy pipeline component that exposes a paragraphs parser (python generator) as the custom document attribute `doc._.paragraphs`.
    Paragraphs are split on any line break, unless more than half of the sentences already start at a line break (e.g. hard-wrapped text); in that case only whitespace tokens containing two or more line breaks split paragraphs.
    Each paragraph span includes the whitespace token on either side, so consecutive paragraphs overlap by that break token. This allows users to use the built-in sents parser on each paragraph.
  USAGE:
    nlp.add_pipe('paragraph_parser')
    doc = nlp(document_str)
    paragraph_list = list(doc._.paragraphs)
  ARGUMENTS:
    document: the Doc being processed. `document.sents` is read, so sentence boundaries must already have been set by an earlier pipeline component.
  RETURNS:
    The same document, with `document._._paragraph_break_` set to 0 (split on any line break) or 1 (split only on multiple line breaks).
  """

  # Paragraph-Break Private Variable
  spacy.tokens.doc.Doc.set_extension("_paragraph_break_",default=None,force=True)

  def is_break_token(token):
    return token.is_space and "\n" in token.text

  def starts_at_break(sentence):
    first_token = sentence[0]
    if is_break_token(first_token):
      return True
    # Only look behind when a previous token exists: for the very first token,
    # index -1 would wrap around and inspect the LAST token of the document.
    return first_token.i > 0 and is_break_token(document[first_token.i-1])

  # Do most sentences start with break?
  sentences = list(document.sents)
  break_start_count = sum(1 for sentence in sentences if starts_at_break(sentence))
  # A document without sentences (e.g. empty text) must not divide by zero;
  # it falls back to the single-line-break rule.
  sentences_start_with_break = bool(sentences) and break_start_count/len(sentences) > 0.5

  # Define Paragraph Breaks
  document._._paragraph_break_ = 1 if sentences_start_with_break else 0

  # Get Paragraphs Generator Function
  def get_paragraphs(doc):
    # The extension is registered on the Doc class, so this getter can be
    # reached from a Doc that never went through this component; its threshold
    # is still the default None, which is treated as 0 (any line break splits).
    threshold = doc._._paragraph_break_ or 0
    start = 0
    for token in doc:
      if token.is_space and token.text.count("\n") > threshold:
        # Include the break token at the end of this paragraph...
        yield doc[start:token.i+1]
        # ...and again at the start of the next one.
        start = token.i
    yield doc[start:]

  # paragraphs public getter variable.
  spacy.tokens.doc.Doc.set_extension("paragraphs", getter=get_paragraphs,force=True)

  return document