Created
December 9, 2021 23:12
-
-
Save ApolloZhu/aade1c11bcb78aec0f3b77828f94e3fa to your computer and use it in GitHub Desktop.
Word Splitter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Foundation | |
/// Namespace for breaking text into words, additionally splitting compound
/// identifiers such as `camelCase`, `PascalCase`, `snake_case`, and mixed
/// letter/digit runs into their components.
public enum Splitter {
    /// Regular-expression pattern matching every place a word should be broken.
    ///
    /// Alternatives, in order:
    /// 1. a run of characters that are neither letters nor digits (consumed),
    /// 2. the zero-width position between a letter and a following digit,
    /// 3. the zero-width position between a digit and a following letter,
    /// 4. the position between a lowercase letter or digit and an uppercase letter,
    /// 5. the position between an uppercase letter and an uppercase letter that
    ///    is itself followed by a lowercase letter (the end of an acronym),
    /// 6. the position between any letter or digit and an uppercase letter that
    ///    is followed by a lowercase letter.
    ///
    /// https://stackoverflow.com/a/62527131
    private static let wordBoundaries = #"([^\p{L}\d]+|(?<=\p{L})(?=\d)|(?<=\d)(?=\p{L})|(?<=[\p{Ll}\d])(?=\p{Lu})|(?<=\p{Lu})(?=\p{Lu}\p{Ll})|(?<=[\p{L}\d])(?=\p{Lu}\p{Ll}))"#

    /// `wordBoundaries` compiled once and reused for every word, instead of
    /// handing the pattern string to Foundation again on each callback.
    ///
    /// `try!` is deliberate: the pattern is a compile-time constant, so a
    /// failure to compile it is a programmer error.
    private static let wordBoundaryRegex = try! NSRegularExpression(pattern: Splitter.wordBoundaries)

    /// Splits `string` into its component words.
    ///
    /// The string is first enumerated by words (`.byWords`); every word is then
    /// broken further at the positions described by `wordBoundaries`.
    ///
    /// All words are yielded, and the stream is finished, inside the stream's
    /// build closure.
    ///
    /// - Parameter string: The text to split.
    /// - Returns: A stream yielding the words in their order of appearance.
    static func split(_ string: String) -> AsyncStream<String> {
        AsyncStream<String> { continuation in
            string.enumerateSubstrings(in: string.startIndex..., options: .byWords) { word, _, _, _ in
                guard let word = word else {
                    return
                }
                // Mark each boundary with a space, then cut on the spaces.
                let separated = Self.wordBoundaryRegex.stringByReplacingMatches(
                    in: word,
                    options: [],
                    range: NSRange(word.startIndex..., in: word),
                    withTemplate: " "
                )
                for piece in separated.split(separator: " ") {
                    continuation.yield(String(piece))
                }
            }
            continuation.finish()
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import XCTest | |
@testable import Splitter | |
/// Table-driven tests for `Splitter.split(_:)`.
final class SplitterTests: XCTestCase {
    /// Inputs mapped to the words they are expected to be split into.
    ///
    /// https://stackoverflow.com/questions/18379254/regex-to-split-camel-case
    let cases: [String: [String]] = [
        "MyCamelCaseString": ["My", "Camel", "Case", "String"],
        "ExampleID": ["Example", "ID"],
        "ExampleId": ["Example", "Id"],
        "MyCamelCaseStringID": ["My", "Camel", "Case", "String", "ID"],
        "myCamelCaseString": ["my", "Camel", "Case", "String"],
        "PDFSplitAndMergeSamples": ["PDF", "Split", "And", "Merge", "Samples"],
        "PDFExtractorSDKSamples": ["PDF", "Extractor", "SDK", "Samples"],
        "PDFRendererSDKSamples": ["PDF", "Renderer", "SDK", "Samples"],
        "BarcodeReaderSDKSamples": ["Barcode", "Reader", "SDK", "Samples"],
        "camelCase": ["camel", "Case"],
        "simple": ["simple"],
        "number1Case2": ["number", "1", "Case", "2"],
        "CamelCaseXYZ": ["Camel", "Case", "XYZ"],
        "ThisIsASlug": ["This", "Is", "A", "Slug"],
        "ABCMyCamelCaseSTR": ["ABC", "My", "Camel", "Case", "STR"],
        "ThereIsWay_too MuchCGIInFilms These-days": ["There", "Is", "Way", "too", "Much", "CGI", "In", "Films", "These", "days"],
        "UnicodeCanBeCAPITALISEDTooYouKnow": ["Unicode", "Can", "Be", "CAPITALISED", "Too", "You", "Know"],
        "CAPITALLetters at the StartOfAString_work_too": ["CAPITAL", "Letters", "at", "the", "Start", "Of", "A", "String", "work", "too"],
        "As_they_DoAtTheEND": ["As", "they", "Do", "At", "The", "END"],
        "BitteWerfenSie-dieFußballeInDenMüll": ["Bitte", "Werfen", "Sie", "die", "Fußballe", "In", "Den", "Müll"],
        "IchHabeUberGesagtNichtÜber": ["Ich", "Habe", "Uber", "Gesagt", "Nicht", "Über"],
        "2BeOrNot2Be": ["2", "Be", "Or", "Not", "2", "Be"],
        "ICannotBelieveThe100GotRenewed. It-isSOOOOOOBad": ["I", "Cannot", "Believe", "The", "100", "Got", "Renewed", "It", "is", "SOOOOOO", "Bad"],
        "colName": ["col", "Name"],
        "This_Is_A_title": ["This", "Is", "A", "title"],
        "And_How_About_thisOne": ["And", "How", "About", "this", "One"],
        "MaryHadALittleLamb": ["Mary", "Had", "A", "Little", "Lamb"],
        "employeeID": ["employee", "ID"],
        "N B A Finals": ["N", "B", "A", "Finals"],
        "N B A Finals in L A": ["N", "B", "A", "Finals", "in", "L", "A"],
        "I Love L A": ["I", "Love", "L", "A"],
        "今天天气如何": ["今天", "天气", "如何"],
        "我的englishSpeaking不是特别nice": ["我", "的", "english", "Speaking", "不", "是", "特别", "nice"],
    ]

    /// Splits every fixture input and compares the collected words with the
    /// expected list.
    func testExamples() async throws {
        for (input, expected) in cases {
            let actual = await Splitter.split(input).collected()
            // Dictionary iteration order is unspecified, so name the failing
            // input explicitly in the assertion message.
            XCTAssertEqual(actual, expected, "Unexpected split for input \"\(input)\"")
        }
    }
}
/// Convenience for draining an asynchronous sequence into an array.
///
/// https://www.hackingwithswift.com/quick-start/concurrency/how-to-convert-an-asyncsequence-into-a-sequence
extension AsyncSequence {
    /// Awaits every element of the sequence and returns them in the order
    /// they were produced.
    ///
    /// - Returns: All elements of the sequence.
    /// - Throws: Rethrows any error thrown while iterating the sequence.
    func collected() async rethrows -> [Element] {
        var elements: [Element] = []
        for try await element in self {
            elements.append(element)
        }
        return elements
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment