Skip to content

Instantly share code, notes, and snippets.

@ApolloZhu
Created December 9, 2021 23:12
Show Gist options
  • Save ApolloZhu/aade1c11bcb78aec0f3b77828f94e3fa to your computer and use it in GitHub Desktop.
Save ApolloZhu/aade1c11bcb78aec0f3b77828f94e3fa to your computer and use it in GitHub Desktop.
Word Splitter
import Foundation
/// Namespace for splitting text into individual words.
///
/// Input is first segmented into natural-language words by Foundation
/// (`.byWords`), and every such word is then broken further at
/// identifier-style boundaries (camelCase humps, letter/digit transitions,
/// and non-alphanumeric separators).
public enum Splitter {
    /// Regular expression whose matches mark where a token must be split.
    ///
    /// The alternation covers, in order:
    /// 1. `[^\p{L}\d]+` — any run of characters that are neither letters nor digits;
    /// 2. `(?<=\p{L})(?=\d)` — the position between a letter and a following digit;
    /// 3. `(?<=\d)(?=\p{L})` — the position between a digit and a following letter;
    /// 4. `(?<=[\p{Ll}\d])(?=\p{Lu})` — a lowercase letter or digit followed by an uppercase letter;
    /// 5. `(?<=\p{Lu})(?=\p{Lu}\p{Ll})` — an uppercase letter followed by uppercase + lowercase
    ///    (the end of an acronym, e.g. `PDF|Split`);
    /// 6. `(?<=[\p{L}\d])(?=\p{Lu}\p{Ll})` — any letter or digit followed by uppercase + lowercase.
    ///
    /// Adapted from https://stackoverflow.com/a/62527131
    private static let wordBoundaries = #"([^\p{L}\d]+|(?<=\p{L})(?=\d)|(?<=\d)(?=\p{L})|(?<=[\p{Ll}\d])(?=\p{Lu})|(?<=\p{Lu})(?=\p{Lu}\p{Ll})|(?<=[\p{L}\d])(?=\p{Lu}\p{Ll}))"#

    /// Splits `string` into words and delivers them, in order, through an `AsyncStream`.
    ///
    /// All words are yielded and the stream is finished inside the stream's
    /// build closure, so the returned stream always terminates.
    ///
    /// - Parameter string: The text to split. An empty string produces an empty stream.
    /// - Returns: A stream that yields each word as its own `String`.
    public static func split(_ string: String) -> AsyncStream<String> {
        AsyncStream<String> { continuation in
            string.enumerateSubstrings(in: string.startIndex..., options: .byWords) { token, _, _, _ in
                guard let token = token else {
                    return
                }
                // Turn every boundary match into a single space, then split on
                // spaces; `split` drops the empty pieces left by adjacent matches.
                let spaced = token.replacingOccurrences(
                    of: Self.wordBoundaries,
                    with: " ",
                    options: .regularExpression
                )
                for word in spaced.split(separator: " ") {
                    continuation.yield(String(word))
                }
            }
            continuation.finish()
        }
    }
}
import XCTest
@testable import Splitter
/// Table-driven tests for `Splitter.split(_:)`.
final class SplitterTests: XCTestCase {
    /// Fixture table: each key is an input string and each value is the list of
    /// words `Splitter.split(_:)` is expected to yield for it, in order.
    ///
    /// Most inputs come from
    /// https://stackoverflow.com/questions/18379254/regex-to-split-camel-case
    ///
    /// NOTE(review): the last two (Chinese) cases presumably rely on the
    /// platform's word segmentation rather than on the regex — confirm they
    /// are stable across OS versions.
    let cases: [String: [String]] = [
        "MyCamelCaseString": ["My", "Camel", "Case", "String"],
        "ExampleID": ["Example", "ID"],
        "ExampleId": ["Example", "Id"],
        "MyCamelCaseStringID": ["My", "Camel", "Case", "String", "ID"],
        "myCamelCaseString": ["my", "Camel", "Case", "String"],
        "PDFSplitAndMergeSamples": ["PDF", "Split", "And", "Merge", "Samples"],
        "PDFExtractorSDKSamples": ["PDF", "Extractor", "SDK", "Samples"],
        "PDFRendererSDKSamples": ["PDF", "Renderer", "SDK", "Samples"],
        "BarcodeReaderSDKSamples": ["Barcode", "Reader", "SDK", "Samples"],
        "camelCase": ["camel", "Case"],
        "simple": ["simple"],
        "number1Case2": ["number", "1", "Case", "2"],
        "CamelCaseXYZ": ["Camel", "Case", "XYZ"],
        "ThisIsASlug": ["This", "Is", "A", "Slug"],
        "ABCMyCamelCaseSTR": ["ABC", "My", "Camel", "Case", "STR"],
        "ThereIsWay_too MuchCGIInFilms These-days": ["There", "Is", "Way", "too", "Much", "CGI", "In", "Films", "These", "days"],
        "UnicodeCanBeCAPITALISEDTooYouKnow": ["Unicode", "Can", "Be", "CAPITALISED", "Too", "You", "Know"],
        "CAPITALLetters at the StartOfAString_work_too": ["CAPITAL", "Letters", "at", "the", "Start", "Of", "A", "String", "work", "too"],
        "As_they_DoAtTheEND": ["As", "they", "Do", "At", "The", "END"],
        "BitteWerfenSie-dieFußballeInDenMüll": ["Bitte", "Werfen", "Sie", "die", "Fußballe", "In", "Den", "Müll"],
        "IchHabeUberGesagtNichtÜber": ["Ich", "Habe", "Uber", "Gesagt", "Nicht", "Über"],
        "2BeOrNot2Be": ["2", "Be", "Or", "Not", "2", "Be"],
        "ICannotBelieveThe100GotRenewed. It-isSOOOOOOBad": ["I", "Cannot", "Believe", "The", "100", "Got", "Renewed", "It", "is", "SOOOOOO", "Bad"],
        "colName": ["col", "Name"],
        "This_Is_A_title": ["This", "Is", "A", "title"],
        "And_How_About_thisOne": ["And", "How", "About", "this", "One"],
        "MaryHadALittleLamb": ["Mary", "Had", "A", "Little", "Lamb"],
        "employeeID": ["employee", "ID"],
        "N B A Finals": ["N", "B", "A", "Finals"],
        "N B A Finals in L A": ["N", "B", "A", "Finals", "in", "L", "A"],
        "I Love L A": ["I", "Love", "L", "A"],
        "今天天气如何": ["今天", "天气", "如何"],
        "我的englishSpeaking不是特别nice": ["我", "的", "english", "Speaking", "不", "是", "特别", "nice"],
    ]

    /// Runs every fixture through `Splitter.split(_:)` and compares the
    /// collected words with the expected list.
    func testExamples() async throws {
        for (input, expected) in cases {
            let actual = await Splitter.split(input).collected()
            // Dictionary iteration order is unspecified, so name the input in
            // the failure message to make a failing case identifiable.
            XCTAssertEqual(actual, expected, "Unexpected split for input \"\(input)\"")
        }
    }
}
/// https://www.hackingwithswift.com/quick-start/concurrency/how-to-convert-an-asyncsequence-into-a-sequence
extension AsyncSequence {
    /// Consumes the whole sequence and returns its elements as an array,
    /// in the order they were produced.
    ///
    /// - Returns: Every element of the sequence.
    /// - Throws: Rethrows any error thrown while iterating the sequence.
    func collected() async rethrows -> [Element] {
        let gathered: [Element] = try await reduce(into: []) { buffer, element in
            buffer.append(element)
        }
        return gathered
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment