Created
January 4, 2022 16:48
-
-
Save boulama/dba091b58fa4597a243795aa710d2a14 to your computer and use it in GitHub Desktop.
Efficiently cut sentences in PHP using RegEx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Split a block of text into individual sentences.
 *
 * The text is cut on the whitespace that follows sentence-ending punctuation
 * (".", "!" or "?", optionally followed by a quote), except after a small set
 * of known abbreviations (Mr., Mrs., Ms., Jr., Dr., Prof., Sr., T.V.A.).
 * Afterwards, any fragment that ends in a single-character token followed by
 * a period (an initial such as "John F." or a list marker such as "1.") is
 * glued back onto the fragment that follows it, because such a period almost
 * never terminates a sentence. This is a heuristic: a sentence that genuinely
 * ends in a single letter or digit ("I have 5.") will be joined with the next.
 *
 * @param string $text Text to split; null or other scalars are cast to string.
 *
 * @return string[] Sentences in their original order, indexed 0..n-1.
 *                  An empty array is returned for empty or whitespace-only text.
 */
function separateSentences($text)
{
    $text = trim((string) $text);
    if ($text === '') {
        return [];
    }

    $split_sentences = '%(?#!php/i split_sentences Rev:20160820_1800)
        # Split sentences on whitespace between them.
        # See: http://stackoverflow.com/a/5844564/433790
        (?<=                # Sentence split location preceded by
          [.!?]             # either an end of sentence punct,
        | [.!?][\'"]        # or end of sentence punct and quote.
        )                   # End positive lookbehind.
        (?<!                # But don\'t split after these whole words
                            # (\b keeps "items." from matching "Ms."):
          \bMr\.            # Either "Mr."
        | \bMrs\.           # Or "Mrs."
        | \bMs\.            # Or "Ms."
        | \bJr\.            # Or "Jr."
        | \bDr\.            # Or "Dr."
        | \bProf\.          # Or "Prof."
        | \bSr\.            # Or "Sr."
        | \bT\.V\.A\.       # Or "T.V.A."
        )                   # End negative lookbehind.
        \s+                 # Split on whitespace between sentences,
        (?=\S)              # (but not at end of string).
        %xi';               // End $split_sentences.

    $fragments = preg_split($split_sentences, $text, -1, PREG_SPLIT_NO_EMPTY);
    if ($fragments === false) {
        // The regex engine failed (e.g. backtrack limit); fall back to
        // returning the text unsplit rather than propagating false.
        return [$text];
    }

    $sentences = [];
    foreach ($fragments as $fragment) {
        $last = count($sentences) - 1;

        // If the sentence built so far ends in a lone letter/digit plus a
        // period, the split that produced it was spurious: keep appending.
        if ($last >= 0 && preg_match('/(?:^|\s)[a-zA-Z0-9]\.$/', $sentences[$last]) === 1) {
            $sentences[$last] .= ' ' . $fragment;
            continue;
        }

        $sentences[] = $fragment;
    }

    return $sentences;
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment