Last active
January 30, 2024 14:51
-
-
Save kuotinyen/bc918b2a005b160b3f3846db3146b2dc to your computer and use it in GitHub Desktop.
20230109-SwiftRegex.swift
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Foundation | |
import XCTest | |
import RegexBuilder | |
// MARK: - Regex I | |
final class regTests: XCTestCase { | |
#warning("《Create a Regex》") | |
// 1. /<reg>/ -> build time | |
// 2. try! Regex(#"<reg>"#) -> runtime string | |
// 3. Regex Builder | |
#warning("《Case 1: Validation》") | |
func testValidation() {
    // Two equivalent ways to spell the pattern that this test takes from `emailRegex`:
    //   let regex = try! Regex(#"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"#)
    //   let regex = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/
    let firstHit = email.matches(of: emailRegex).first

    // The sample address is expected to produce at least one match.
    XCTAssertNotNil(firstHit)
}
#warning("《Case 2: Remove pattern》") | |
func testReplace() {
    // Extended delimiters (#/.../#) allow "/" to appear unescaped in the pattern;
    // with plain delimiters the same regex is spelled /<v>.*?<\/v>/.
    let valueTag = #/<v>.*?</v>/#
    var target = "blablabla</f><v>1.23456789</v>bla"
    let expected = "blablabla</f>bla"

    // Replacing every match with the empty string removes the whole <v>…</v> element in place.
    target.replace(valueTag, with: "")

    XCTAssertEqual(expected, target)
}
#warning("《Case 3: Replacing》") | |
/// Replaces only the first priced `</f><v>N元</v>` element of `html` with "Hi".
///
/// `replacing(_:with:maxReplacements:)` returns a new string, so `html` itself is not modified.
/// The assertions compare match counts before and after instead of hard-coding the document,
/// which keeps the test valid whatever the current content of the shared `html` variable is.
func testReplacing() {
    let regex = /<\/f><v>(\d+)元<\/v>/
    let matchCountBefore = html.matches(of: regex).count

    print("#### [Replacing] before html: \(html)")
    let afterHtml = html.replacing(regex, with: "Hi", maxReplacements: 1)
    print("#### [Replacing] after html: \(afterHtml)")

    // There must be something to replace, exactly one occurrence must be gone,
    // and the replacement text must be present.
    XCTAssertGreaterThan(matchCountBefore, 0)
    XCTAssertEqual(afterHtml.matches(of: regex).count, matchCountBefore - 1)
    XCTAssertTrue(afterHtml.contains("Hi"))
}
#warning("《Case 4: Yes, Optional.》") | |
func testOptionalRegex() {
    let target = "Fancy a game of Cluedo™?"

    // Swift Regex flavour.
    // \b is a word boundary (whitespace, punctuation, or the start/end of the sentence), so the
    // pattern matches in "<...> blabla", "blabla <...>" and "blabla <...>?" but not in "<...>e".
    let swiftRanges = target.ranges(of: #/\bClue(do)?™?\b/#)
    XCTAssertNotNil(swiftRanges.first)

    // Foundation flavour: the same pattern passed as a raw string.
    // Without the raw-string delimiters it would be written "\\bClue(do)?™?\\b".
    let foundationRange = target.range(of: #"\bClue(do)?™?\b"#, options: .regularExpression)
    XCTAssertNotNil(foundationRange)
}
#warning("《Case 5: Regex builder and capture》") | |
// >>>>> </f><v>3元</v> | |
/// Multiplies every price of the form `</f><v>N元</v>` by ten using a RegexBuilder capture.
///
/// The work is done on a local copy of `html`, so the shared file-level document is left
/// untouched and other tests see the same input regardless of execution order.
func testRegexBuilderAndCapture() {
    // The capture exposes the digits as `match.1`; `match.output` is e.g. ("</f><v>5元</v>", "5").
    let regex = Regex {
        "</f><v>"
        Capture { OneOrMore(.digit) }
        "元</v>"
    }

    var document = html
    let matches = document.matches(of: regex)
    XCTAssertFalse(matches.isEmpty)

    // replaceSubrange shifts everything after the edited range, so walk the matches from the
    // last one to the first: the ranges that are still pending then remain valid.
    for match in matches.reversed() {
        let value = (Int(match.1) ?? 0) * 10
        print("#### [testHTML] before html: \(document)")
        document.replaceSubrange(match.range, with: "</f><v>\(value)元</v>")
        print("#### [testHTML] after html: \(document)")
    }

    // Every captured price must now be ten times its original value, in the same order.
    let expectedValues = matches.map { (Int($0.1) ?? 0) * 10 }
    let updatedValues = document.matches(of: regex).map { Int($0.1) ?? 0 }
    XCTAssertEqual(updatedValues, expectedValues)
}
#warning("《Case 5: Skip escaping: #》") | |
func testSkipEscaping() {
    let target = "</f><v>3元</v>"

    // With the extended #/ ... /# delimiters the "/" characters in the pattern need no escaping.
    let regex = #/
        </f><v>(\d+)元</v>
        /#

    let firstHit = target.matches(of: regex).first
    XCTAssertNotNil(firstHit)
}
#warning("《Case 6: Performace test》") // reg, NS, regex builder | |
func testPerformancePredicate() {
    // Build the predicate once, outside the measured block.
    let emailPredicate = NSPredicate(format: "SELF MATCHES %@", emailPattern)

    // Measure 10,000 evaluations of the predicate against the sample address.
    measure {
        for _ in 1...10_000 {
            _ = emailPredicate.evaluate(with: email)
        }
    }
}
func testPerformanceRegex() {
    // Measure 10,000 matches of the shared regex literal against the sample address.
    measure {
        for _ in 1...10_000 {
            XCTAssertNotNil(email.matches(of: emailRegex).first)
        }
    }
}
func testPerformanceRegexBuilder() {
    // The same e-mail pattern, wrapped in a RegexBuilder block and built once up front.
    let regexBuilder = Regex {
        /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/
    }

    // Measure 10,000 matches of the builder-based regex against the sample address.
    measure {
        for _ in 1...10_000 {
            XCTAssertNotNil(email.matches(of: regexBuilder).first)
        }
    }
}
func testEscaplingSlashAndHashtag() {
    // A raw multi-line string keeps both "/" and "#" literal, so the run-time pattern
    // needs no escaping for either character.
    let pattern = #"""
        \d+/#\d+
        """#
    let regex = try! Regex(pattern)

    let input = """
        3200/#44444
        """

    // The whole input is a single "<digits>/#<digits>" occurrence.
    let occurrences = input.matches(of: regex).count
    XCTAssertEqual(occurrences, 1)
}
} | |
// MARK: - Helpers | |
/// Sample address used by the validation and performance tests.
let email: String = "example@example.com"

/// The e-mail pattern as a raw string (passed to `NSPredicate` in the performance test).
let emailPattern: String = #"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"#

/// The same e-mail pattern as a regex literal.
let emailRegex: Regex<Substring> = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/

/// Sample price-list document. Prices to be rewritten are prefixed with `</f>`;
/// the "historical" prices are plain `<v>…</v>` elements.
var html: String = """
<html>
<head><title>價格列表</title></head>
<body>
<p>今天的特價品是:</p>
<div>蘋果: </f><v>10元</v></div>
<div>香蕉: </f><v>5元</v></div>
<p>歷史價格:</p>
<div>蘋果: <v>12元</v></div>
<div>香蕉: <v>6元</v></div>
</body>
</html>
"""
extension String {
    /// Removes every occurrence of `regex` from the string in place.
    ///
    /// - Parameter regex: The pattern whose matches are deleted.
    /// - Returns: The string after removal, i.e. the new value of `self`. The result may be
    ///   ignored when the call is made only for its mutating effect.
    @discardableResult
    mutating func remove(regex: Regex<Substring>) -> String {
        replace(regex, with: "")
        return self
    }
}
// MARK: Regex I Summary | |
// 1. Use the extended delimiters (#/.../# for Regex, #"..."# for String) to avoid escaping characters.
// 2. Regex Builder and Capture let you extract the matched values.
// 3. \b matches a word boundary in Regex.
// 4. Use ? to make the preceding element optional, and (target)? to make a whole group optional.
// 5. When calling replaceSubrange for several matches, iterate over the matches in reverse order so that the ranges still to be replaced remain valid.
// 6. NSPredicate is faster in the performance tests above.
// MARK: - Regex II | |
extension regTests { | |
#warning("Matches method") | |
func testRegexMatchesMethods() {
    // firstMatch: the first occurrence anywhere in the string.
    let first = "abc123".firstMatch(of: /\d+/)
    XCTAssertEqual(first?.output, "123")

    // wholeMatch: the pattern has to cover the entire string, so this one fails.
    let whole = "123abc".wholeMatch(of: /\d+/)
    XCTAssertNil(whole?.output)

    // prefixMatch: the match has to begin at the start of the string.
    let prefix = "123abc".prefixMatch(of: /\d+/)
    XCTAssertEqual(prefix?.output, "123")

    // starts(with:) gives the prefix check as a Bool.
    XCTAssertTrue("123abc".starts(with: /\d+/))

    // trimmingPrefix drops the matched prefix.
    let trimmed = "123777777abc".trimmingPrefix(/\d+/)
    XCTAssertEqual(trimmed, "abc")

    // replacing substitutes each match.
    let replaced = "123888888888abc".replacing(/\d+/, with: "777")
    XCTAssertEqual(replaced, "777abc")

    // split uses the regex as the separator.
    let pieces = "123,abc".split(separator: /\s*,\s*/)
    XCTAssertEqual(pieces, ["123", "abc"])
}
#warning("Foundation date and currency") | |
/// Parses a bank statement with Foundation's date and currency parsers used as regex components.
///
/// Locale and time zone are pinned so the result does not depend on the machine running the
/// test: with `.current` the expected date is only correct in one particular time zone, and the
/// currency parser would follow the user's regional settings.
func testRegexFoundationDate() {
    let statement = """
        DSLIP 04/06/20 Paypal $3,020.85
        CREDIT 04/03/20 Payroll $69.73
        DEBIT 04/02/20 Rent ($38.25)
        DEBIT 03/31/20 Grocery ($27.44)
        DEBIT 03/24/20 IRS ($52,249.98)
        """

    let locale = Locale(identifier: "en_US")
    let timeZone = TimeZone.gmt

    // Each line is: <kind> <MM/dd/yy date> <payee> <USD amount, accounting style for negatives>.
    let regex = Regex {
        OneOrMore(.word)
        OneOrMore(.whitespace)
        Capture(
            .date(
                format: "\(month: .twoDigits)/\(day: .twoDigits)/\(year: .twoDigits)",
                locale: locale,
                timeZone: timeZone
            )
        )
        OneOrMore(.whitespace)
        OneOrMore(.word)
        OneOrMore(.whitespace)
        Capture(
            .localizedCurrency(code: "USD", locale: locale)
                .sign(strategy: .accounting)
        )
    }

    let matches = statement.matches(of: regex)
    XCTAssertEqual(matches.count, 5)

    let firstBill = matches.first?.output
    XCTAssertEqual(firstBill?.0, "DSLIP 04/06/20 Paypal $3,020.85")
    // Midnight of 2020-04-06 in the pinned (GMT) time zone.
    XCTAssertEqual(firstBill?.1, Date("2020-04-06 00:00:00 +0000"))
    // Build the expected amount from a string: a floating-point literal goes through Double
    // and may not be exactly 3020.85 as a Decimal.
    XCTAssertEqual(firstBill?.2, Decimal(string: "3020.85"))
}
#warning("Capture without Regex Builder") | |
// 1. Capture pattern: \d{2} | |
// 2. Capture group: | |
// a. (\d{2}) | |
// b. ?<name>pattern | |
func testRegexCaptureWithoutRegexBuilder() {
    let greeting = "Hello WWDC22!"

    // Positional capture group: read through the tuple index.
    let positional = greeting.firstMatch(of: /Hello WWDC(\d{2})!/)
    XCTAssertEqual("22", positional?.1 ?? "none")

    // Named capture group: read through the group's name.
    let named = greeting.firstMatch(of: /Hello WWDC(?<year>\d{2})!/)
    XCTAssertEqual("22", named?.year ?? "none")

    // Builder form without any capture: the whole output is the matched text.
    let fourDigits = Regex {
        Repeat(count: 4) { .digit }
    }
    let builderMatch = "Hello WWDC2024".firstMatch(of: fourDigits)
    XCTAssertEqual("2024", builderMatch?.output ?? "none")
}
#warning("Repeat Capture and Behavior") | |
// 1. ChoiceOf -> switch case | |
// 2. Repeat capture: reluctant | |
// 3. transform | |
// 4. (WIP) TryCapture | |
// 5. Capture with date parser | |
/// Parses XCTest "Test Suite" log lines into suite name, status and timestamp.
///
/// Each line is matched once; a line that fails to match is reported as a test failure
/// instead of being skipped silently, and the captured name and status are asserted.
func testLog() {
    enum Status: Substring {
        case started
        case passed
        case failed
    }

    let regex = Regex {
        "Test Suite '"
        Capture(/[a-zA-Z][a-zA-Z0-9]*/)
        "' "
        Capture {
            // ChoiceOf works much like listing enum cases: any one alternative may match.
            ChoiceOf {
                "started"
                "passed"
                "failed"
            }
        } transform: {
            Status(rawValue: $0)
        }
        " at "
        // a. Capture the timestamp as raw text:
        //      Capture(
        //          OneOrMore(.any, .reluctant)
        //      )
        //    Repetitions (OneOrMore, ZeroOrMore, Repeat) are .eager by default and would
        //    swallow the trailing "." below. .reluctant matches as few characters as possible:
        //    after every repetition it first checks whether the rest of the pattern matches.
        // b. Capture the timestamp as a Date:
        Capture(
            .iso8601(
                timeZone: .current,
                includingFractionalSeconds: true,
                dateTimeSeparator: .space
            )
        )
        Optionally(".")
    }

    let testSuiteTestInputs = [
        "Test Suite 'RegexDSLTests' started at 2022-06-06 09:41:00.001",
        "Test Suite 'RegexDSLTests' failed at 2022-06-06 09:41:00.001.",
        "Test Suite 'RegexDSLTests' passed at 2022-06-06 09:41:00.001."
    ]
    // Expected status of each input line, in the same order.
    let expectedStatuses: [Status] = [.started, .failed, .passed]

    for (line, expectedStatus) in zip(testSuiteTestInputs, expectedStatuses) {
        guard let match = line.wholeMatch(of: regex) else {
            XCTFail("No match for line: \(line)")
            continue
        }
        // Whole match.
        print("Matched: \(match.output)")

        // Individual captures; `status` is optional because the transform may return nil.
        let (_, name, status, dateTime) = match.output
        print("#### Matched: \"\(name)\", \"\(String(describing: status))\", \"\(dateTime)\"")

        XCTAssertEqual(name, "RegexDSLTests")
        XCTAssertEqual(status, expectedStatus)
    }
}
#warning("Custom Parser") | |
/// Extracts the duration from a "Test Case … passed (N seconds)." line using the custom
/// `CDoubleParser` regex component, and checks the parsed value.
func testDuration() {
    let input = "Test Case '-[RegexDSLTests testCharacterClass]' passed (0.001 seconds)."

    let regex = Regex {
        "Test Case "
        // Reluctant, so the repetition stops at the first "(" that lets the rest match.
        OneOrMore(.any, .reluctant)
        "("
        Capture {
            // Built-in alternative: .localizedDouble(locale: .current)
            CDoubleParser()
        }
        " seconds)."
    }

    // A failed match is a test failure, not something to skip silently.
    guard let match = input.wholeMatch(of: regex) else {
        XCTFail("No match for input: \(input)")
        return
    }
    print("Time: \(match.output)")
    XCTAssertEqual(match.1, 0.001, accuracy: 1e-9)
}
func testRegexToBuilder() {
    // In Xcode: Select > Right Click > Refactor > Convert to Regex Builder.
    // The date pattern is written here as three named capture components.
    let year = Capture {
        Repeat(count: 4) {
            One(.digit)
        }
    }
    let month = Capture {
        Repeat(count: 2) {
            One(.digit)
        }
    }
    let day = Capture {
        Repeat(count: 2) {
            One(.digit)
        }
    }
    let regex = Regex {
        year
        "-"
        month
        "-"
        day
    }

    // The first capture holds the four-digit year.
    let firstMatch = "2024-01-23".firstMatch(of: regex)
    XCTAssertEqual(firstMatch?.output.1, "2024")
}
} | |
import Darwin | |
/// A custom regex component that parses a floating-point number with C's `strtod`.
struct CDoubleParser: CustomConsumingRegexComponent {
    typealias RegexOutput = Double

    /// Tries to parse a `Double` from `input`, starting at `index`.
    ///
    /// - Parameters:
    ///   - input: The string being matched.
    ///   - index: The position at which parsing starts.
    ///   - bounds: The range of `input` that matching is allowed to look at.
    /// - Returns: The index just past the parsed text together with the parsed value,
    ///   or `nil` when `strtod` consumes nothing at `index`.
    func consuming(
        _ input: String, startingAt index: String.Index, in bounds: Range<String.Index>
    ) throws -> (upperBound: String.Index, output: Double)? {
        // Nothing can be consumed at or beyond the end of the permitted range.
        guard index < bounds.upperBound else { return nil }

        // Hand strtod only the text up to bounds.upperBound so it cannot parse past the
        // portion of the input the regex engine allows this component to consume.
        return input[index..<bounds.upperBound].withCString { startAddress in
            var endAddress: UnsafeMutablePointer<CChar>?
            let output = strtod(startAddress, &endAddress)

            // strtod leaves the end pointer at the start when no conversion was performed.
            guard let endAddress else { return nil }
            let end = UnsafePointer(endAddress)
            guard end > startAddress else { return nil }

            // The C string is UTF-8, so the number of consumed bytes is an offset in the UTF-8 view.
            let parsedLength = startAddress.distance(to: end)
            let upperBound = input.utf8.index(index, offsetBy: parsedLength)
            return (upperBound, output)
        }
    }
}
// MARK: - Regex II Summary | |
/* | |
- You can define a custom parser (CustomConsumingRegexComponent).
- Foundation provides built-in date, currency, and double parsers.
- Use Capture's `transform` to turn matched text into enum cases.
- Use .reluctant so that a repetition matches as little as possible and lets the pattern that follows it match.
- Use (...) to turn a pattern into a capture group, and (?<name>pattern) to give the group a name.
*/ | |
// MARK: - Private helper | |
extension Date {
    /// Creates a date from a string in the fixed format `yyyy-MM-dd HH:mm:ss Z`,
    /// for example `"2020-04-05 16:00:00 +0000"`.
    ///
    /// - Parameter dateString: The text to parse.
    /// - Returns: `nil` when `dateString` does not match the format.
    init?(_ dateString: String) {
        let formatter = DateFormatter()
        // Fixed-format parsing must use the POSIX locale; otherwise the user's regional
        // settings (calendar, 12/24-hour preference) can change how the format is read.
        formatter.locale = Locale(identifier: "en_US_POSIX")
        formatter.dateFormat = "yyyy-MM-dd HH:mm:ss Z"
        formatter.timeZone = TimeZone(secondsFromGMT: 0)

        guard let parsed = formatter.date(from: dateString) else { return nil }
        self = parsed
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment