Last active
October 25, 2023 00:06
-
-
Save TristynAlxander/7f3d2eb6168ae1ee90525cd262c726dd to your computer and use it in GitHub Desktop.
A spaCy pipeline component that adds a paragraph parser (a Python generator) as a custom document attribute (`doc._.paragraphs`).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@spacy.language.Language.component("paragraph_parser")
def define_paragraphs(document):
    """
    DESCRIPTION:
        A spaCy pipeline component that adds a paragraph parser (python generator)
        as the custom document attribute ``doc._.paragraphs``.
        The component identifies paragraphs either by single-line-break or
        multi-line-break, depending on what is used less frequently for sentences:
        if more than half of the sentences start on (or directly after) a
        line-break token, a paragraph break needs more than one newline;
        otherwise any whitespace token containing a newline is a paragraph break.
        The defined paragraphs include the whitespace tokens on either side
        (neighbouring paragraphs share the break token) to allow users to use
        the built-in sents parser.
        This component reads ``document.sents``, so it must run after a
        component that sets sentence boundaries.
    USAGE:
        nlp.add_pipe('paragraph_parser')
        doc = nlp(document_str)
        paragraph_list = list(doc._.paragraphs)
    """
    # Paragraph-Break Private Variable
    spacy.tokens.doc.Doc.set_extension("_paragraph_break_", default=None, force=True)

    def is_break_token(token):
        # A break token is a whitespace token containing at least one newline.
        return token.is_space and "\n" in token.text

    def starts_with_break(sentence):
        first = sentence[0]
        if is_break_token(first):
            return True
        # Only look at the preceding token when there is one: for the very
        # first token, index -1 would wrap around to the END of the document.
        return first.i > 0 and is_break_token(document[first.i - 1])

    # Do most sentences start with break?
    sentences = list(document.sents)
    break_starts = sum(1 for sentence in sentences if starts_with_break(sentence))
    # Guard against documents without any sentence (avoids division by zero).
    sentences_start_with_break = bool(sentences) and break_starts / len(sentences) > 0.5

    # Define Paragraph Breaks
    document._._paragraph_break_ = 1 if sentences_start_with_break else 0

    # Get Paragraphs Generator Function
    def get_paragraphs(doc):
        # Docs that never went through this component still carry the
        # extension default (None); treat them as "any newline is a break".
        threshold = doc._._paragraph_break_
        if threshold is None:
            threshold = 0
        start = 0
        for token in doc:
            if token.is_space and token.text.count("\n") > threshold:
                # The break token closes this paragraph and also opens the
                # next one, so both spans keep their surrounding whitespace.
                yield doc[start:token.i + 1]
                start = token.i
        yield doc[start:]

    # paragraphs public getter variable.
    spacy.tokens.doc.Doc.set_extension("paragraphs", getter=get_paragraphs, force=True)
    return document
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment