Last active
January 10, 2024 05:35
-
-
Save relyky/346d909722452102d9b9 to your computer and use it in GitHub Desktop.
RegEx實戰紀錄;字串解析:String.IndexOf, RegEx, Match, Regular Expression
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Text.RegularExpressions; | |
/// <summary>
/// Splits a Taiwanese postal address into its parts (city/county, district,
/// village, neighbourhood, road, section, lane, alley, house number, floor)
/// using a single regular expression. Parsing happens once, in the constructor.
/// </summary>
internal class AddressParser
{
    /// <summary>
    /// Address grammar. Every named group is optional and the final group
    /// swallows whatever is left, so the expression matches any input string.
    /// Built once and shared, instead of being rebuilt for every instance.
    /// </summary>
    private static readonly Regex AddressPattern = new Regex(
        @"(?<city>\D+?[縣市])?" +
        @"(?<region>\D+(市區|鎮區|鎮市|[鄉鎮市區]))?" +
        @"(?<village>\D+[村里])?" +
        @"(?<neighbor>\d+[鄰])?" +
        @"(?<road>\D+?(村路|[路街道段]))?" +
        @"(?<section>\D+段)?" +
        @"(?<lane>\d+巷)?" +
        @"(?<alley>\d+弄)?" +
        // House number such as "3號", "100-10號" or "3之3號". The separator class
        // accepts the ASCII hyphen, the full-width hyphen (U+FF0D) and "之".
        @"(?<no>\d+([-\uFF0D之]\d+)?號)?" +
        @"(?<floor>\d+樓)?" +
        @"(?<others>.*)");

    /// <summary>
    /// Parses <paramref name="address"/> immediately and fills the properties.
    /// </summary>
    /// <param name="address">The raw address text. Must not be null.</param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="address"/> is null.
    /// </exception>
    public AddressParser(string address)
    {
        if (address == null)
        {
            throw new System.ArgumentNullException(nameof(address));
        }

        this.OrginalAddress = address;
        this.ParseByRegex(address);
    }

    /// <summary>City or county (ends with 縣 or 市).</summary>
    public string City { get; set; } = string.Empty;

    /// <summary>Township / district (ends with 鄉, 鎮, 市 or 區).</summary>
    public string Region { get; set; } = string.Empty;

    /// <summary>Village (ends with 村 or 里).</summary>
    public string Village { get; set; } = string.Empty;

    /// <summary>Neighbourhood number (digits followed by 鄰).</summary>
    public string Neighbor { get; set; } = string.Empty;

    /// <summary>Road or street (ends with 路, 街, 道, 段 or 村路).</summary>
    public string Road { get; set; } = string.Empty;

    /// <summary>Road section (ends with 段).</summary>
    public string Section { get; set; } = string.Empty;

    /// <summary>Lane (digits followed by 巷).</summary>
    public string Lane { get; set; } = string.Empty;

    /// <summary>Alley (digits followed by 弄).</summary>
    public string Alley { get; set; } = string.Empty;

    /// <summary>House number (digits, optional sub-number, followed by 號).</summary>
    public string No { get; set; } = string.Empty;

    /// <summary>Floor (digits followed by 樓).</summary>
    public string Floor { get; set; } = string.Empty;

    /// <summary>Whatever text remains after the recognised parts.</summary>
    public string Others { get; set; } = string.Empty;

    /// <summary>
    /// True when the address matched the pattern. Because every part of the
    /// pattern is optional, this is true for any non-null input; inspect the
    /// individual properties to see what was actually recognised.
    /// </summary>
    public bool IsParseSuccessed { get; set; }

    /// <summary>The address text exactly as passed to the constructor.</summary>
    public string OrginalAddress { get; private set; } = string.Empty;

    /// <summary>
    /// Runs the address pattern and copies each named group into its property.
    /// A group that did not participate in the match yields an empty string.
    /// </summary>
    private void ParseByRegex(string address)
    {
        Match match = AddressPattern.Match(address);
        if (!match.Success)
        {
            return;
        }

        GroupCollection parts = match.Groups;
        this.IsParseSuccessed = true;
        this.City = parts["city"].Value;
        this.Region = parts["region"].Value;
        this.Village = parts["village"].Value;
        this.Neighbor = parts["neighbor"].Value;
        this.Road = parts["road"].Value;
        this.Section = parts["section"].Value;
        this.Lane = parts["lane"].Value;
        this.Alley = parts["alley"].Value;
        this.No = parts["no"].Value;
        this.Floor = parts["floor"].Value;
        this.Others = parts["others"].Value;
    }
}
/// 測試 | |
/// var parserT = new AddressParser("某縣某市某里3鄰某路三段3巷3弄3之3號3樓"); | |
/// var parserP = new AddressParser("某市某區某路3號"); | |
/// var parser1 = new AddressParser("新北市土城區金城路二段100-10號100樓"); | |
/// var parser2 = new AddressParser("桃園市龍潭區中正路三坑段999號"); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Scans an LOI clause and yields every person name or telephone number found,
/// in order of appearance.
/// </summary>
/// <param name="loi_clause">The clause text to scan.</param>
/// <returns>
/// One <c>NameOrTel</c> per token (its first constructor argument is true for a
/// telephone number, false for a name), followed by a single trailing
/// <c>null</c> element that marks the end of the sequence.
/// </returns>
public static IEnumerable<NameOrTel> CaptureNameOrTel(string loi_clause)
{
    // Telephone pattern: optional "(H)/(O)/(M)" prefix, digits mixed with
    // punctuation, ending in a digit, optional "-H", "(H)" or "(主要)" suffix.
    string ptnTel = @"(\([HOM]\))?[0-9()+]+[0-9\-()*~# ]*[0-9]+(\-[HOM]|\([HOM]\)|\(主要\))?";

    // Name pattern: CJK ideographs / private-use characters optionally followed
    // by Latin text, or a purely Latin name.
    // The Latin ranges are "a-zA-Z"; the earlier "A-z" range also matched the
    // punctuation characters [ \ ] ^ _ ` that sit between 'Z' and 'a'.
    string ptnName = @"((\p{IsCJKUnifiedIdeographs}|[\ue000-\ue3ff])+([a-zA-Z\s]+[a-zA-Z\s-]*)?|[a-zA-Z]+[a-zA-Z\s-]*)";

    // Name is tried first, then telephone. Named groups record which
    // alternative produced the match, so no second regex pass is needed.
    string ptnNameOrTel = "(?<name>" + ptnName + ")|(?<tel>" + ptnTel + ")";

    foreach (Match m in Regex.Matches(loi_clause, ptnNameOrTel))
    {
        bool isTel = m.Groups["tel"].Success;
        yield return new NameOrTel(isTel, m.Value);
    }

    // The sequence always ends with null to signal completion to the consumer.
    yield return null;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
////////////////////////////////////////////////////////////////////
// Parses a complete, fixed-grammar sentence by calling Regex.Match
// repeatedly, each time resuming after the previous match.
////////////////////////////////////////////////////////////////////
/// <summary>
/// Matches the words "This", "is", "a", "BOOK" one after another in the text of
/// <c>txt02src</c> and writes a trace of the form <c>[value:index,length]</c>,
/// joined by " → ", to <c>txt02ans</c>. The first word that cannot be found
/// ends the trace with "X".
/// </summary>
protected void btn02_Click(object sender, EventArgs e)
{
    string str = txt02src.Text.Trim(); // e.g. "This is a BOOK."
    string[] words = { "This", "is", "a", "BOOK" };

    string trace = string.Empty;
    int startAt = 0;
    bool isFirst = true;

    foreach (string word in words)
    {
        string separator = isFirst ? string.Empty : " → ";

        // Regex.Match(input, startat) throws when startat is past the end of
        // the input (e.g. the input is just "This"); treat that as "no match".
        Match m = startAt <= str.Length
            ? new Regex(word).Match(str, startAt)
            : Match.Empty;

        if (!m.Success)
        {
            txt02ans.Text = trace + separator + "X";
            return;
        }

        trace += separator + string.Format("[{0}:{1},{2}]", m.Value, m.Index, m.Length);

        // Resume one character after this match to skip the separating blank.
        startAt = m.Index + m.Length + 1;
        isFirst = false;
    }

    txt02ans.Text = trace;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// 密碼驗證:必需有數字、大寫英文字元、小寫英文字元、特殊字元,非空白字元8位以上 | |
(?=.*[0-9])(?=.*[A-Z])(?=.*[a-z])(?=.*[!@#$%^&+=])\S{8,} | |
// 密碼驗證:必需有數字、大寫英文字元、小寫英文字元、特殊字元,只能填指定字元(英數字與特殊字元)8位以上 | |
(?=.*\d)(?=.*[A-Z])(?=.*[a-z])(?=.*[!@#$%^&+=])[\w!@#$%^&+=]{8,} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
////////////////////////////////////////////////////////////////////////////////////////////////////// | |
// 使用 RegEx 解析文字報表文件檔 | |
// 如同報表,可先以節段區分出如:RH (report header), PH (page header), DT (detail) | |
// 解析單位為一行。 | |
// 函式類別只有兩種: IsMatch<REPORT_SECTION>, TryParse<REPORT_SECTION> | |
////////////////////////////////////////////////////////////////////////////////////////////////////// | |
using System; | |
using System.Collections.Generic; | |
using System.Text.RegularExpressions; | |
namespace TEST | |
{ | |
/// <summary>
/// Line-oriented regex helper for a fixed-layout text report. Report lines
/// belong to sections: report header (RH1-RH3), page header (PH1-PH3) and
/// detail (DT1-DT2). The static <c>IsMatch*</c> methods classify a line; the
/// instance <c>TryParse*</c> methods extract field values into this object.
/// </summary>
public class DMR_RptParseHelper
{
    #region properties
    // detail fields
    public string CustomerID = string.Empty;
    public string ChangeType = string.Empty;
    public string ChangeTypeName = string.Empty;
    public string DataFrom = string.Empty;
    public string DataTo = string.Empty;
    public string VpUserID = string.Empty;
    // header fields
    public string ProcDate = string.Empty;
    #endregion

    #region patterns
    // Built once and reused by every call instead of being rebuilt per line.
    private static readonly Regex CustomerIdPattern = new Regex(@"\w+");
    private static readonly Regex ChangeTypePattern = new Regex(@"\d+");
    private static readonly Regex ChangeTypeNamePattern = new Regex(@"\S+(\s\S+)*\s+FROM");
    private static readonly Regex DataFromPattern = new Regex(@"FROM=\s?\S+");
    private static readonly Regex VpUserIdPattern = new Regex(@"[\w.]+");
    private static readonly Regex DataToPattern = new Regex(@"TO=\s?\S+");
    private static readonly Regex ProcDatePattern = new Regex(@"PROC DATE \d\d/\d\d/\d{4}");

    // NOTE: RH1 requires 40 or more trailing whitespace characters.
    private static readonly Regex ReportHead1Pattern = new Regex(@"^((AR000000 - (R02|R05|R09|R59))\s+(ANZ - TAIWAN)\s{40,})$");
    private static readonly Regex ReportHead2Pattern = new Regex(@"^((021 - ANZ)\s+(CUST NAME-ADDRESS MAINTENACE)\s+(FILE DATE )\d\d/\d\d/\d{4}\s+(PAGE)\s+\d+)$");
    // NOTE: RH3 requires 50 or more leading whitespace characters.
    private static readonly Regex ReportHead3Pattern = new Regex(@"^(\s{50,}(PROC DATE )\d\d/\d\d/\d{4}\s+(TIME )\d\d\.\d\d\.\d\d\s*)$");
    private static readonly Regex PageHead1Pattern = new Regex(@"^((MAINT)\s+(\*--SECURITY NAME--\*))$");
    private static readonly Regex PageHead2Pattern = new Regex(@"^((ACCOUNT)\s+(CODE)\s+(S)\s+(DATE)\s+(TIME))$");
    private static readonly Regex PageHead3Pattern = new Regex(@"^((NUMBER)\s+(\+ OCC)\s+(FIELD DESCRIPTION)\s.+(CHANGE DATA).+(C)\s+(STAMP)\s+(STAMP)\s+(TERM))$");
    private static readonly Regex Detail1Pattern = new Regex(@"^((\w+)\s+(\d+)\s+(\S+(\s\S+)*)\s+(FROM=\s?\S+)\s+([\w.]+))$");
    // NOTE: DT2 requires 30 or more leading whitespace characters.
    private static readonly Regex Detail2Pattern = new Regex(@"^(\s{30,}(TO=)(\s?\S+)\s+([A-Z])\s+(\d{6})\s+(\d{6})\s+(\w*)\s*)$");
    #endregion

    /// <summary>Returns all parsed fields in a single diagnostic line.</summary>
    public override string ToString()
    {
        return string.Format(
            "Acc[{0}], Chg[{1}], ChgN[{2}], From[{3}], To[{4}], V+ User[{5}], ProcDate[{6}]",
            this.CustomerID,
            this.ChangeType,
            this.ChangeTypeName,
            this.DataFrom,
            this.DataTo,
            this.VpUserID,
            this.ProcDate);
    }

    /// <summary>
    /// Parses detail line 1, e.g.
    /// "0000000000A122326123 9206 OWNER ADDRESS 1 FROM= 樂利二街962巷35號111樓 CTW7.61",
    /// into CustomerID, ChangeType, ChangeTypeName, DataFrom and VpUserID.
    /// Fields are matched left to right, each search resuming after the
    /// previous match.
    /// </summary>
    /// <param name="lineDT1">The raw report line.</param>
    /// <returns>
    /// True when all five fields were found. On false, the fields parsed before
    /// the failure keep their values and the rest are empty. DataTo is never
    /// touched by this method.
    /// </returns>
    public bool TryParseDetailLine1(string lineDT1)
    {
        // Start from "nothing parsed".
        this.CustomerID = string.Empty;
        this.ChangeType = string.Empty;
        this.ChangeTypeName = string.Empty;
        this.DataFrom = string.Empty;
        this.VpUserID = string.Empty;

        if (lineDT1 == null)
        {
            return false;
        }

        // Field 1: Customer ID (leading zeros removed).
        Match m1 = MatchFrom(CustomerIdPattern, lineDT1, 0);
        if (!m1.Success)
        {
            return false;
        }
        this.CustomerID = m1.Value.Trim().TrimStart('0');

        // Field 2: Change Type. "+ 1" skips the separator after the previous field.
        Match m2 = MatchFrom(ChangeTypePattern, lineDT1, m1.Index + m1.Length + 1);
        if (!m2.Success)
        {
            return false;
        }
        this.ChangeType = m2.Value.Trim();

        // Field 3: Change Type Name. The match runs up to and including the
        // literal "FROM", which is then cut off (4 characters).
        Match m3 = MatchFrom(ChangeTypeNamePattern, lineDT1, m2.Index + m2.Length + 1);
        if (!m3.Success)
        {
            return false;
        }
        this.ChangeTypeName = m3.Value.Remove(m3.Value.Length - 4, 4).Trim();

        // Field 4: Data From. Search restarts at the "FROM" that ended field 3;
        // the 5-character "FROM=" prefix is dropped from the value.
        Match m4 = MatchFrom(DataFromPattern, lineDT1, m3.Index + m3.Length - 4);
        if (!m4.Success)
        {
            return false;
        }
        this.DataFrom = m4.Value.Substring(5).Trim();

        // Field 5: V+ User ID (dots removed).
        Match m5 = MatchFrom(VpUserIdPattern, lineDT1, m4.Index + m4.Length + 1);
        if (!m5.Success)
        {
            return false;
        }
        this.VpUserID = m5.Value.Trim().Replace(".", "");

        return true;
    }

    /// <summary>
    /// Parses detail line 2 and stores the value following "TO=" in DataTo.
    /// </summary>
    /// <param name="lineDT2">The raw report line.</param>
    /// <returns>True on success; on failure DataTo is set to empty.</returns>
    public bool TryParseDetailLine2(string lineDT2)
    {
        Match m = lineDT2 == null ? Match.Empty : DataToPattern.Match(lineDT2);
        if (!m.Success)
        {
            this.DataTo = string.Empty;
            return false;
        }

        // Drop the 3-character "TO=" prefix.
        this.DataTo = m.Value.Substring(3).Trim();
        return true;
    }

    /// <summary>
    /// Parses report header line 3 and stores the processing date in ProcDate,
    /// reformatted as yyyy/MM/dd.
    /// </summary>
    /// <param name="lineRH3">The raw report line, e.g. "... PROC DATE 08/27/2016 TIME 00.15.59".</param>
    /// <returns>
    /// True on success; on failure (no "PROC DATE" field or an invalid
    /// calendar date) ProcDate is set to empty.
    /// </returns>
    public bool TryParseReportHeaderLine3(string lineRH3)
    {
        Match m = lineRH3 == null ? Match.Empty : ProcDatePattern.Match(lineRH3);
        if (!m.Success)
        {
            this.ProcDate = string.Empty;
            return false;
        }

        // The report prints the date as MM/dd/yyyy. Parse and format with the
        // invariant culture so the result does not depend on machine settings
        // (day/month order on input, date separator on output).
        string dateText = m.Value.Substring("PROC DATE ".Length);
        DateTime dt;
        bool parsed = DateTime.TryParseExact(
            dateText,
            "MM/dd/yyyy",
            System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.None,
            out dt);
        if (!parsed)
        {
            this.ProcDate = string.Empty;
            return false;
        }

        this.ProcDate = dt.ToString("yyyy/MM/dd", System.Globalization.CultureInfo.InvariantCulture);
        return true;
    }

    /// <summary>
    /// True when the untrimmed line is report header line 1:
    /// "AR000000 - Rnn ANZ - TAIWAN" followed by at least 40 whitespace characters.
    /// </summary>
    public static bool IsMatchReportHeadLine1(string line)
    {
        return line != null && ReportHead1Pattern.IsMatch(line);
    }

    /// <summary>
    /// True when the trimmed line is report header line 2, e.g.
    /// "021 - ANZ CUST NAME-ADDRESS MAINTENACE FILE DATE 08/26/2016 PAGE 6".
    /// </summary>
    public static bool IsMatchReportHeadLine2(string line)
    {
        return line != null && ReportHead2Pattern.IsMatch(line.Trim());
    }

    /// <summary>
    /// True when the untrimmed line is report header line 3: at least 50
    /// leading whitespace characters, then "PROC DATE 08/27/2016 TIME 00.15.59".
    /// </summary>
    public static bool IsMatchReportHeadLine3(string line)
    {
        return line != null && ReportHead3Pattern.IsMatch(line);
    }

    /// <summary>
    /// True when the trimmed line is page header line 1: "MAINT *--SECURITY NAME--*".
    /// </summary>
    public static bool IsMatchPageHeadLine1(string line)
    {
        return line != null && PageHead1Pattern.IsMatch(line.Trim());
    }

    /// <summary>
    /// True when the trimmed line is page header line 2: "ACCOUNT CODE S DATE TIME".
    /// </summary>
    public static bool IsMatchPageHeadLine2(string line)
    {
        return line != null && PageHead2Pattern.IsMatch(line.Trim());
    }

    /// <summary>
    /// True when the trimmed line is page header line 3, e.g.
    /// "NUMBER + OCC FIELD DESCRIPTION **---CHANGE DATA---** C STAMP STAMP TERM".
    /// </summary>
    public static bool IsMatchPageHeadLine3(string line)
    {
        return line != null && PageHead3Pattern.IsMatch(line.Trim());
    }

    /// <summary>
    /// True when the trimmed line has the detail line 1 layout: customer id,
    /// change type, change type name, "FROM=" value and V+ user id, e.g.
    /// "0000000000A122326123 9206 OWNER ADDRESS 1 FROM= 樂利二街962巷35號111樓 CTW7.61".
    /// The line is trimmed first, so leading blanks are not significant here.
    /// </summary>
    public static bool IsMatchDetailLine1(string line)
    {
        return line != null && Detail1Pattern.IsMatch(line.Trim());
    }

    /// <summary>
    /// True when the untrimmed line has the detail line 2 layout: at least 30
    /// leading whitespace characters, then e.g.
    /// "TO= 樂利九街962巷35號111樓 L 082216 155530 GOIJ".
    /// </summary>
    public static bool IsMatchDetailLine2(string line)
    {
        return line != null && Detail2Pattern.IsMatch(line);
    }

    /// <summary>
    /// Matches <paramref name="pattern"/> in <paramref name="input"/> starting
    /// at <paramref name="startAt"/>, returning a failed match instead of
    /// throwing when the start position lies beyond the end of the input.
    /// </summary>
    private static Match MatchFrom(Regex pattern, string input, int startAt)
    {
        return startAt <= input.Length ? pattern.Match(input, startAt) : Match.Empty;
    }
}
} | |
/////////////////// 應用/測試紀錄 ////////////// | |
/// <summary>
/// Test handler for the two detail lines: shows whether each line matches its
/// layout (DT1/DT2), then parses both lines and appends the parsed fields,
/// prefixed with "FAIL!" when either parse did not succeed.
/// </summary>
protected void btnLine_Click(object sender, EventArgs e)
{
    string detailLine1 = txtLine1.Text;
    string detailLine2 = txtLine2.Text;

    txtLineAns.Text = string.Empty;

    // Layout check for each line.
    txtLineAns.Text += DMR_RptParseHelper.IsMatchDetailLine1(detailLine1) ? "DT1 Y " : "DT1 N ";
    txtLineAns.Text += DMR_RptParseHelper.IsMatchDetailLine2(detailLine2) ? "DT2 Y " : "DT2 N ";

    // Field extraction. Both lines are always parsed, even if the first fails,
    // so the output shows everything that could be recovered.
    DMR_RptParseHelper parsed = new DMR_RptParseHelper();
    bool line1Ok = parsed.TryParseDetailLine1(detailLine1);
    bool line2Ok = parsed.TryParseDetailLine2(detailLine2);

    string prefix = (line1Ok && line2Ok) ? "; " : "; FAIL! → ";
    txtLineAns.Text += prefix + parsed.ToString();
}
/// <summary>
/// Test handler for the six header lines: shows whether each line matches its
/// layout (RH1-RH3, PH1-PH3), then parses the processing date out of header
/// line 3 and appends it when parsing succeeds.
/// </summary>
protected void btnHead_Click(object sender, EventArgs e)
{
    string reportHead1 = txtHeadLine1.Text;
    string reportHead2 = txtHeadLine2.Text;
    string reportHead3 = txtHeadLine3.Text;
    string pageHead1 = txtHeadLine4.Text;
    string pageHead2 = txtHeadLine5.Text;
    string pageHead3 = txtHeadLine6.Text;

    txtHeadAns.Text = string.Empty;

    // Layout check for every header line, in report order.
    txtHeadAns.Text += DMR_RptParseHelper.IsMatchReportHeadLine1(reportHead1) ? "RH1 Y " : "RH1 N ";
    txtHeadAns.Text += DMR_RptParseHelper.IsMatchReportHeadLine2(reportHead2) ? "RH2 Y " : "RH2 N ";
    txtHeadAns.Text += DMR_RptParseHelper.IsMatchReportHeadLine3(reportHead3) ? "RH3 Y " : "RH3 N ";
    txtHeadAns.Text += DMR_RptParseHelper.IsMatchPageHeadLine1(pageHead1) ? "PH1 Y " : "PH1 N ";
    txtHeadAns.Text += DMR_RptParseHelper.IsMatchPageHeadLine2(pageHead2) ? "PH2 Y " : "PH2 N ";
    txtHeadAns.Text += DMR_RptParseHelper.IsMatchPageHeadLine3(pageHead3) ? "PH3 Y " : "PH3 N ";

    // Extract the processing date from header line 3.
    DMR_RptParseHelper parsed = new DMR_RptParseHelper();
    if (parsed.TryParseReportHeaderLine3(txtHeadLine3.Text))
    {
        txtHeadAns.Text += ", PROC_DATE : " + parsed.ProcDate;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Extracts every "LOI:" clause from a free-text note using plain
/// <c>IndexOf</c> searches (no regular expressions).
/// </summary>
/// <param name="notes_for_tsd">The note text to scan.</param>
/// <returns>
/// The text after each "LOI:" tag, up to (not including) the nearest of:
/// "CBC", "DCI", "PRO", "HYBF", "客戶已辦理授權", "被授權人", the next "LOI:" tag,
/// or the end of the text. Yields nothing for null or empty input.
/// </returns>
public static IEnumerable<string> QueryLoiClause(string notes_for_tsd)
{
    const string LoiTag = "LOI:";
    // These are fixed tokens, not linguistic text: compare ordinally.
    const System.StringComparison Ordinal = System.StringComparison.Ordinal;

    if (string.IsNullOrEmpty(notes_for_tsd))
    {
        yield break;
    }

    // Anything that ends a clause. A following "LOI:" also ends the current
    // clause, which handles several clauses in a row.
    string[] terminators = { "CBC", "DCI", "PRO", "HYBF", "客戶已辦理授權", "被授權人", LoiTag };

    int searchFrom = 0;
    while (true)
    {
        // A tag at index 0 is a valid hit; only a negative index means "none".
        int tagIndex = notes_for_tsd.IndexOf(LoiTag, searchFrom, Ordinal);
        if (tagIndex < 0)
        {
            yield break;
        }

        int clauseStart = tagIndex + LoiTag.Length;

        // The clause may also run to the end of the text.
        int clauseEnd = notes_for_tsd.Length;
        foreach (string terminator in terminators)
        {
            int hit = notes_for_tsd.IndexOf(terminator, clauseStart, Ordinal);
            if (hit >= 0 && hit < clauseEnd)
            {
                clauseEnd = hit;
            }
        }

        yield return notes_for_tsd.Substring(clauseStart, clauseEnd - clauseStart);

        // Continue from the terminator, so a terminating "LOI:" is picked up
        // as the next clause.
        searchFrom = clauseEnd;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment