// Skip to content
//
// Instantly share code, notes, and snippets.
//
// @kuotinyen
// Last active January 30, 2024 14:51
// Show Gist options
//   • Save kuotinyen/bc918b2a005b160b3f3846db3146b2dc to your computer and use it in GitHub Desktop.
// Save kuotinyen/bc918b2a005b160b3f3846db3146b2dc to your computer and use it in GitHub Desktop.
// 20230109-SwiftRegex.swift
import Foundation
import XCTest
import RegexBuilder
// MARK: - Regex I
/// Walkthrough of basic Swift `Regex` usage: validation, replacing, optional
/// groups, RegexBuilder captures, extended delimiters and a rough performance
/// comparison against `NSPredicate`.
///
/// The tests read the file-level `email`, `emailPattern`, `emailRegex` and
/// `html` fixtures. None of them mutates those globals, so the tests do not
/// depend on execution order.
final class regTests: XCTestCase {
    #warning("《Create a Regex》")
    // 1. /<reg>/ -> regex literal, checked at build time
    // 2. try! Regex(#"<reg>"#) -> built from a string at runtime
    // 3. Regex Builder

    #warning("《Case 1: Validation》")
    /// The sample e-mail address must match the shared e-mail regex.
    func testValidation() {
        // Either of these local definitions would work as well:
        // let regex = try! Regex(#"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"#)
        // let regex = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/
        let matches = email.matches(of: emailRegex)
        XCTAssertNotNil(matches.first)
    }

    #warning("《Case 2: Remove pattern》")
    /// Removes every `<v>…</v>` element from a string in place.
    func testReplace() {
        let expected = "blablabla</f>bla"
        var target = "blablabla</f><v>1.23456789</v>bla"
        let regex = #/<v>.*?</v>/#
        // [equals to] let regex = /<v>.*?<\/v>/
        target.replace(regex, with: "")
        XCTAssertEqual(target, expected)
    }

    #warning("《Case 3: Replacing》")
    /// Replaces only the first price element and checks that exactly one
    /// occurrence disappeared.
    func testReplacing() {
        let regex = /<\/f><v>(\d+)元<\/v>/
        // Work on a snapshot so the assertions compare against a stable value.
        let source = html
        print("#### [Replacing] before html: \(source)")
        let afterHtml = source.replacing(regex, with: "Hi", maxReplacements: 1)
        print("#### [Replacing] after html: \(afterHtml)")

        let occurrencesBefore = source.matches(of: regex).count
        let occurrencesAfter = afterHtml.matches(of: regex).count
        XCTAssertGreaterThan(occurrencesBefore, 0)
        XCTAssertEqual(occurrencesAfter, occurrencesBefore - 1)
        XCTAssertNotEqual(afterHtml, source)
    }

    #warning("《Case 4: Yes, Optional.》")
    /// `?` makes the preceding character or group optional; the same pattern
    /// works with `Regex` and with `range(of:options:)`.
    func testOptionalRegex() {
        let target = "Fancy a game of Cluedo™?"
        // \b is a word boundary (whitespace, punctuation, start/end of the
        // sentence): it matches "<...> blabla", "blabla <...>" and
        // "blabla <...>?", but not "<...>e".
        let ranges = target.ranges(of: #/\bClue(do)?™?\b/#)
        XCTAssertNotNil(ranges.first)

        let matchRange = target.range(
            of: #"\bClue(do)?™?\b"#,
            options: .regularExpression
        )
        // [Equals to]
        // let matchRange = target.range(
        //     of: "\\bClue(do)?™?\\b",
        //     options: .regularExpression
        // )
        XCTAssertNotNil(matchRange)
    }

    #warning("《Case 5: Regex builder and capture》")
    // >>>>> </f><v>3元</v>
    /// Multiplies every captured price by ten using a RegexBuilder capture.
    func testRegexBuilderAndCapture() {
        // Update with capture and dynamic value
        let regex = Regex {
            "</f><v>"
            Capture { OneOrMore(.digit) }
            "元</v>"
        }

        // Operate on a local copy: mutating the shared `html` fixture would
        // leak state into every other test that reads it.
        var document = html
        let expectedValues = document.matches(of: regex).map { (Int($0.output.1) ?? 0) * 10 }

        // replaceSubrange shifts the indices behind the replaced range, so the
        // matches are processed back to front to keep earlier ranges valid.
        for match in document.matches(of: regex).reversed() {
            // match.output -> ("</f><v>5元</v>", "5")
            let value = (Int(match.output.1) ?? 0) * 10
            print("#### [testHTML] before html: \(document)")
            document.replaceSubrange(match.range, with: "</f><v>\(value)元</v>")
            print("#### [testHTML] after html: \(document)")
        }

        let actualValues = document.matches(of: regex).map { Int($0.output.1) ?? 0 }
        XCTAssertFalse(expectedValues.isEmpty)
        XCTAssertEqual(actualValues, expectedValues)
    }

    #warning("《Case 5: Skip escaping: #》")
    /// Extended delimiters (`#/ … /#`) allow unescaped `/` inside the pattern.
    func testSkipEscaping() {
        let regex = #/
        </f><v>(\d+)元</v>
        /#
        let target = "</f><v>3元</v>"
        let matches = target.matches(of: regex)
        XCTAssertNotNil(matches.first)
    }

    #warning("《Case 6: Performace test》") // reg, NS, regex builder
    /// Baseline: e-mail validation through `NSPredicate`.
    func testPerformancePredicate() {
        let emailPredicate = NSPredicate(format: "SELF MATCHES %@", emailPattern)
        measure {
            for _ in 0..<10000 {
                _ = emailPredicate.evaluate(with: email)
            }
        }
    }

    /// E-mail validation through a regex literal.
    func testPerformanceRegex() {
        measure {
            for _ in 0..<10000 {
                let matches = email.matches(of: emailRegex)
                XCTAssertNotNil(matches.first)
            }
        }
    }

    /// E-mail validation through a regex wrapped in a RegexBuilder block.
    func testPerformanceRegexBuilder() {
        let regexBuilder = Regex {
            /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/
        }
        measure {
            for _ in 0..<10000 {
                let matches = email.matches(of: regexBuilder)
                XCTAssertNotNil(matches.first)
            }
        }
    }

    /// A pattern containing both `/` and `#` can be written as a raw
    /// multi-line string and compiled at runtime.
    ///
    /// The test throws instead of using `try!`, so an invalid pattern is
    /// reported as a test failure rather than crashing the test process.
    func testEscaplingSlashAndHashtag() throws {
        let regex =
            try Regex(#"""
            \d+/#\d+
            """#)
        XCTAssertEqual(
            """
            3200/#44444
            """.matches(of: regex).count,
            1
        )
    }
}
// MARK: - Helpers
/// Sample address used by the validation and performance tests.
let email: String = "example@example.com"

/// E-mail pattern as a raw string, for APIs that take the pattern as text.
let emailPattern: String = #"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"#

/// The same e-mail pattern as a regex literal.
let emailRegex: Regex<Substring> = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/

/// HTML fixture with a price list. Only the "today's special" prices are
/// preceded by `</f>`; the historical prices are not.
var html: String = """
<html>
<head><title>價格列表</title></head>
<body>
<p>今天的特價品是:</p>
<div>蘋果: </f><v>10元</v></div>
<div>香蕉: </f><v>5元</v></div>
<p>歷史價格:</p>
<div>蘋果: <v>12元</v></div>
<div>香蕉: <v>6元</v></div>
</body>
</html>
"""
extension String {
    /// Removes every occurrence of `regex` from the string in place.
    ///
    /// Any regex component is accepted, including regexes with captures or
    /// ones built at runtime, not only `Regex<Substring>`.
    ///
    /// - Parameter regex: The pattern whose matches are deleted.
    /// - Returns: The string after removal (the same value `self` now holds).
    ///   The result may be ignored when only the mutation is wanted.
    @discardableResult
    mutating func remove(regex: some RegexComponent) -> String {
        replace(regex, with: "")
        return self
    }
}
// MARK: Regex I Summary
// 1. Use extended delimiters (#/.../# for regex literals, #"..."# for raw strings) to avoid escaping characters.
// 2. Regex Builder and Capture can catch matches `value`.
// 3. \b is for word boundary in Regex.
// 4. Use ? for optional parts and (target)? for specific words.
// 5. When calling replaceSubrange in a loop, iterate over the matches in reverse order so that the ranges of the remaining (earlier) matches stay valid.
// 6. NSPredicate is faster in performance.
// MARK: - Regex II
/// Second part of the walkthrough: matching methods, Foundation parsers used
/// as regex components, captures, repetition behavior and custom parsers.
extension regTests {
    #warning("Matches method")
    /// Tour of the regex-powered string algorithms.
    func testRegexMatchesMethods() {
        // First match
        XCTAssertEqual(
            "abc123".firstMatch(of: /\d+/)?.output,
            "123"
        )
        // Whole match: fails because the input does not consist of digits only
        XCTAssertNil("123abc".wholeMatch(of: /\d+/))
        // Prefix Match
        XCTAssertEqual(
            "123abc".prefixMatch(of: /\d+/)?.output,
            "123"
        )
        // Starts with -> Bool
        XCTAssertTrue("123abc".starts(with: /\d+/))
        // Trimming
        XCTAssertEqual(
            "123777777abc".trimmingPrefix(/\d+/),
            "abc"
        )
        // Replacing
        XCTAssertEqual(
            "123888888888abc".replacing(/\d+/, with: "777"),
            "777abc"
        )
        // Split
        XCTAssertEqual(
            "123,abc".split(separator: /\s*,\s*/),
            ["123", "abc"]
        )
    }

    #warning("Foundation date and currency")
    /// Parses dates and currency amounts with Foundation parsers embedded in
    /// a RegexBuilder regex.
    ///
    /// Locale and time zone are pinned so the expected values do not depend
    /// on the settings of the machine running the test.
    func testRegexFoundationDate() throws {
        let statement = """
        DSLIP 04/06/20 Paypal $3,020.85
        CREDIT 04/03/20 Payroll $69.73
        DEBIT 04/02/20 Rent ($38.25)
        DEBIT 03/31/20 Grocery ($27.44)
        DEBIT 03/24/20 IRS ($52,249.98)
        """
        let locale = Locale(identifier: "en_US")
        let timeZone = try XCTUnwrap(TimeZone(secondsFromGMT: 0))

        let regex = Regex {
            OneOrMore(.word)
            OneOrMore(.whitespace)
            Capture(
                .date(
                    format: "\(month: .twoDigits)/\(day: .twoDigits)/\(year: .twoDigits)",
                    locale: locale,
                    timeZone: timeZone
                )
            )
            OneOrMore(.whitespace)
            OneOrMore(.word)
            OneOrMore(.whitespace)
            Capture(
                .localizedCurrency(code: "USD", locale: locale)
                .sign(strategy: .accounting)
            )
        }
        let matches = statement.matches(of: regex)
        XCTAssertEqual(matches.count, 5)

        let firstBill = matches.first?.output
        XCTAssertEqual(firstBill?.0, "DSLIP 04/06/20 Paypal $3,020.85")
        // A date without a time component is midnight in the parser's time zone.
        XCTAssertEqual(firstBill?.1, Date("2020-04-06 00:00:00 +0000"))
        XCTAssertEqual(firstBill?.2, 3020.85)
    }

    #warning("Capture without Regex Builder")
    // 1. Capture pattern: \d{2}
    // 2. Capture group:
    //    a. (\d{2})
    //    b. ?<name>pattern
    /// Positional and named captures in regex literals, plus `Repeat`.
    func testRegexCaptureWithoutRegexBuilder() {
        let match1 = "Hello WWDC22!".firstMatch(of: /Hello WWDC(\d{2})!/)
        XCTAssertEqual("22", match1?.1 ?? "none")

        let match2 = "Hello WWDC22!".firstMatch(of: /Hello WWDC(?<year>\d{2})!/)
        XCTAssertEqual("22", match2?.year ?? "none")

        let match3 = "Hello WWDC2024".firstMatch(of: Regex {
            Repeat(count: 4) { .digit }
        })
        XCTAssertEqual("2024", match3?.output ?? "none")
    }

    #warning("Repeat Capture and Behavior")
    // 1. ChoiceOf -> switch case
    // 2. Repeat capture: reluctant
    // 3. transform
    // 4. (WIP) TryCapture
    // 5. Capture with date parser
    /// Parses test-log lines into suite name, status and timestamp.
    func testLog() {
        enum Status: Substring {
            case started
            case passed
            case failed
        }
        let regex = Regex {
            "Test Suite '"
            Capture(/[a-zA-Z][a-zA-Z0-9]*/)
            "' "
            Capture {
                // ChoiceOf reads much like a list of enum cases.
                ChoiceOf {
                    "started"
                    "passed"
                    "failed"
                }
            } transform: {
                Status(rawValue: $0)
            }
            " at "
            // a. capture digits
            // Capture(
            //     OneOrMore(.any, .reluctant)
            // )
            // Repetition (OneOrMore, ZeroOrMore, Repeat) is .eager by default
            // and would swallow the trailing "." handled below. .reluctant
            // matches as few characters as possible: after each repetition it
            // first checks whether the components that follow already match.
            // b. capture date
            Capture(
                .iso8601(
                    timeZone: .current,
                    includingFractionalSeconds: true,
                    dateTimeSeparator: .space
                )
            )
            Optionally(".")
        }
        let testSuiteTestInputs = [
            "Test Suite 'RegexDSLTests' started at 2022-06-06 09:41:00.001",
            "Test Suite 'RegexDSLTests' failed at 2022-06-06 09:41:00.001.",
            "Test Suite 'RegexDSLTests' passed at 2022-06-06 09:41:00.001."
        ]
        for line in testSuiteTestInputs {
            // Match once and reuse the result for both printouts.
            guard let match = line.wholeMatch(of: regex) else {
                XCTFail("Expected the whole line to match: \(line)")
                continue
            }
            // whole match
            print("Matched: \(match.output)")
            // Capture
            let (_, name, status, dateTime) = match.output
            XCTAssertEqual(name, "RegexDSLTests")
            XCTAssertNotNil(status)
            print("#### Matched: \"\(name)\", \"\(String(describing: status))\", \"\(dateTime)\"")
        }
    }

    #warning("Custom Parser")
    /// Extracts the duration of a test case with the custom `CDoubleParser`.
    func testDuration() {
        let input = "Test Case '-[RegexDSLTests testCharacterClass]' passed (0.001 seconds)."
        let regex = Regex {
            "Test Case "
            OneOrMore(.any, .reluctant)
            "("
            Capture {
                // .localizedDouble(locale: .current)
                CDoubleParser()
            }
            " seconds)."
        }
        guard let match = input.wholeMatch(of: regex) else {
            XCTFail("Expected the input to match the duration pattern")
            return
        }
        print("Time: \(match.output)")
        XCTAssertEqual(match.output.1, 0.001, accuracy: 1e-9)
    }

    /// Result of Xcode's "Convert to Regex Builder" refactoring for a
    /// `yyyy-MM-dd` pattern.
    func testRegexToBuilder() {
        // Select > Right Click > Refactor > Convert to Regex Builder
        let regex = Regex {
            Capture {
                Repeat(count: 4) {
                    One(.digit)
                }
            }
            "-"
            Capture {
                Repeat(count: 2) {
                    One(.digit)
                }
            }
            "-"
            Capture {
                Repeat(count: 2) {
                    One(.digit)
                }
            }
        }
        let firstMatch = "2024-01-23".firstMatch(of: regex)
        XCTAssertEqual(
            firstMatch?.output.1,
            "2024"
        )
    }
}
import Darwin
/// Regex component that parses a floating-point number with C's `strtod`.
struct CDoubleParser: CustomConsumingRegexComponent {
    typealias RegexOutput = Double

    /// Tries to parse a `Double` starting at `index`.
    ///
    /// - Parameters:
    ///   - input: The string being matched.
    ///   - index: Position at which parsing starts.
    ///   - bounds: The range of `input` that may be consumed; nothing at or
    ///     beyond `bounds.upperBound` is read.
    /// - Returns: The index just past the parsed text together with the parsed
    ///   value, or `nil` when `strtod` consumed no characters.
    func consuming(
        _ input: String, startingAt index: String.Index, in bounds: Range<String.Index>
    ) throws -> (upperBound: String.Index, output: Double)? {
        guard index < bounds.upperBound else { return nil }
        // Hand strtod only the permitted slice so it cannot read past `bounds`.
        let candidate = input[index..<bounds.upperBound]
        return candidate.withCString { start -> (upperBound: String.Index, output: Double)? in
            var end: UnsafeMutablePointer<CChar>?
            let value = strtod(start, &end)
            // strtod leaves the end pointer at the start when nothing was parsed.
            guard let end, end > start else { return nil }
            // The C string is the UTF-8 encoding of `candidate`, so the byte
            // distance is an offset into the UTF-8 view starting at `index`.
            let consumedBytes = start.distance(to: UnsafePointer(end))
            let upperBound = input.utf8.index(index, offsetBy: consumedBytes)
            return (upperBound, value)
        }
    }
}
// MARK: - Regex II Summary
/*
- You can define a custom parser.
- Foundation provides built-in date, currency and double parsers.
- Use Capture's `transform` to turn a match into an enum case.
- Use .reluctant so a repetition matches as little as possible and lets the pattern that follows it match.
- Use (...) to turn a pattern into a capture group, and (?<name>pattern) to give the group a name.
*/
// MARK: - Private helper
extension Date {
    /// Creates a date from a string in the fixed format
    /// `yyyy-MM-dd HH:mm:ss Z`, e.g. `"2020-04-05 16:00:00 +0000"`.
    ///
    /// - Parameter dateString: The text to parse.
    /// - Returns: `nil` when `dateString` does not match the format.
    init?(_ dateString: String) {
        let formatter = DateFormatter()
        // A fixed-format formatter needs the POSIX locale; otherwise the
        // user's calendar or 12/24-hour preference can change how the format
        // string is interpreted and make valid input fail to parse.
        formatter.locale = Locale(identifier: "en_US_POSIX")
        formatter.dateFormat = "yyyy-MM-dd HH:mm:ss Z"
        formatter.timeZone = TimeZone(secondsFromGMT: 0)
        guard let parsed = formatter.date(from: dateString) else {
            return nil
        }
        self = parsed
    }
}
// Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment