Skip to content

Instantly share code, notes, and snippets.

@relyky
Last active January 10, 2024 05:35
Show Gist options
  • Save relyky/346d909722452102d9b9 to your computer and use it in GitHub Desktop.
Save relyky/346d909722452102d9b9 to your computer and use it in GitHub Desktop.
RegEx實戰紀錄;字串解析:String.IndexOf, RegEx, Match, Regular Expression
using System.Text.RegularExpressions;
/// <summary>
/// Splits a Chinese-language street address (縣市 / 鄉鎮市區 / 村里 / 鄰 / 路 / 段 / 巷 / 弄 / 號 / 樓)
/// into its components with a single regular expression. Every component is
/// exposed as a string property; a component that is absent from the input is
/// the empty string.
/// </summary>
internal class AddressParser
{
    // Compiled once and shared: the pattern never changes, so there is no
    // reason to rebuild it for every parsed address.
    // Every component group is optional and the pattern ends with a catch-all,
    // so it matches any input (possibly with every component empty).
    // NOTE(review): the two hyphen alternatives in the "no" group are identical in
    // this copy of the source; one of them may originally have been a full-width
    // hyphen. Left untouched - confirm against the real data.
    private static readonly Regex AddressPattern = new Regex(
        @"(?<city>\D+?[縣市])?" +
        @"(?<region>\D+(市區|鎮區|鎮市|[鄉鎮市區]))?" +
        @"(?<village>\D+[村里])?" +
        @"(?<neighbor>\d+[鄰])?" +
        @"(?<road>\D+?(村路|[路街道段]))?" +
        @"(?<section>\D+段)?" +
        @"(?<lane>\d+巷)?" +
        @"(?<alley>\d+弄)?" +
        @"(?<no>\d+((-|-|之)\d+)?號)?" +
        @"(?<floor>\d+樓)?" +
        @"(?<others>.*)");

    /// <summary>
    /// Parses <paramref name="address"/> immediately; the results are available
    /// through the properties as soon as the constructor returns.
    /// </summary>
    /// <param name="address">The raw address text to split.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="address"/> is null.</exception>
    public AddressParser(string address)
    {
        if (address == null)
        {
            throw new System.ArgumentNullException(nameof(address));
        }

        this.OrginalAddress = address;
        this.ParseByRegex(address);
    }

    /// <summary>
    /// County / city (縣市).
    /// </summary>
    public string City { get; set; } = string.Empty;

    /// <summary>
    /// Township / district (鄉鎮市區).
    /// </summary>
    public string Region { get; set; } = string.Empty;

    /// <summary>
    /// Village (村里).
    /// </summary>
    public string Village { get; set; } = string.Empty;

    /// <summary>
    /// Neighborhood (鄰).
    /// </summary>
    public string Neighbor { get; set; } = string.Empty;

    /// <summary>
    /// Road / street (路).
    /// </summary>
    public string Road { get; set; } = string.Empty;

    /// <summary>
    /// Section (段).
    /// </summary>
    public string Section { get; set; } = string.Empty;

    /// <summary>
    /// Lane (巷).
    /// </summary>
    public string Lane { get; set; } = string.Empty;

    /// <summary>
    /// Alley (弄).
    /// </summary>
    public string Alley { get; set; } = string.Empty;

    /// <summary>
    /// House number (號), including an optional "-n" / "之n" suffix.
    /// </summary>
    public string No { get; set; } = string.Empty;

    /// <summary>
    /// Floor (樓).
    /// </summary>
    public string Floor { get; set; } = string.Empty;

    /// <summary>
    /// Whatever text follows the last recognized component.
    /// </summary>
    public string Others { get; set; } = string.Empty;

    /// <summary>
    /// Whether the input matched the pattern. Because the pattern consists only
    /// of optional groups plus a catch-all, this is true for every parsed address;
    /// inspect the individual components to judge the quality of the parse.
    /// </summary>
    public bool IsParseSuccessed { get; set; }

    /// <summary>
    /// The address exactly as it was passed to the constructor.
    /// </summary>
    public string OrginalAddress { get; private set; } = string.Empty;

    // Runs the shared pattern against the address and copies each named group
    // into its property. An unmatched optional group yields the empty string.
    private void ParseByRegex(string address)
    {
        Match match = AddressPattern.Match(address);
        if (!match.Success)
        {
            return;
        }

        this.IsParseSuccessed = true;
        this.City = match.Groups["city"].Value;
        this.Region = match.Groups["region"].Value;
        this.Village = match.Groups["village"].Value;
        this.Neighbor = match.Groups["neighbor"].Value;
        this.Road = match.Groups["road"].Value;
        this.Section = match.Groups["section"].Value;
        this.Lane = match.Groups["lane"].Value;
        this.Alley = match.Groups["alley"].Value;
        this.No = match.Groups["no"].Value;
        this.Floor = match.Groups["floor"].Value;
        this.Others = match.Groups["others"].Value;
    }
}
/// 測試
/// var parserT = new AddressParser("某縣某市某里3鄰某路三段3巷3弄3之3號3樓");
/// var parserP = new AddressParser("某市某區某路3號");
/// var parser1 = new AddressParser("新北市土城區金城路二段100-10號100樓");
/// var parser2 = new AddressParser("桃園市龍潭區中正路三坑段999號");
/// <summary>
/// Scans <paramref name="loi_clause"/> and yields every person name or phone
/// number found in it, in order of appearance. Each item is flagged as a phone
/// number or not (anything that is not a phone number is treated as a name).
/// </summary>
/// <param name="loi_clause">The text to scan.</param>
/// <returns>
/// One <c>NameOrTel</c> per match, followed by a single trailing <c>null</c>
/// that marks the end of the sequence.
/// </returns>
public static IEnumerable<NameOrTel> CaptureNameOrTel(string loi_clause)
{
    // Phone number pattern: optional "(H)/(O)/(M)" prefix, digits mixed with
    // separators, ending in a digit, with an optional "-H", "(H)" or "(主要)" suffix.
    string ptnTel = @"(\([HOM]\))?[0-9()+]+[0-9\-()*~# ]*[0-9]+(\-[HOM]|\([HOM]\)|\(主要\))?";

    // Name pattern := CJK ideographs (plus private-use code points U+E000-U+E3FF),
    // optionally followed by Latin letters | a purely Latin name.
    // The letter class is [a-zA-Z]; the previous "A-z" range also spanned the
    // ASCII punctuation between 'Z' and 'a' ([ \ ] ^ _ `), which let that
    // punctuation be captured as part of a name.
    string ptnName = @"((\p{IsCJKUnifiedIdeographs}|[\ue000-\ue3ff])+([a-zA-Z\s]+[a-zA-Z\s-]*)?|[a-zA-Z]+[a-zA-Z\s-]*)";

    // Name-or-phone pattern; the name alternative is tried first.
    string ptnNameOrTel = "(" + ptnName + "|" + ptnTel + ")";

    foreach (Match m in Regex.Matches(loi_clause, ptnNameOrTel))
    {
        // Re-test the captured text against the phone pattern to classify it.
        bool isTel = Regex.IsMatch(m.Value, ptnTel);
        yield return new NameOrTel(isTel, m.Value);
    }

    // The sequence always ends with null to signal that scanning has finished.
    yield return null;
}
////////////////////////////////////////////////////////////////////
// 連續使用 RegEx.Match 解析一句完整的句子。當然文法是固定的簡單句子。
////////////////////////////////////////////////////////////////////
/// <summary>
/// Click handler that parses the fixed sentence "This is a BOOK." from
/// <c>txt02src</c> word by word and writes a trace of each match to
/// <c>txt02ans</c>. Every matched word is shown as "[value:index,length]";
/// the first word that cannot be found is shown as "X" and parsing stops.
/// </summary>
protected void btn02_Click(object sender, EventArgs e)
{
    string str = txt02src.Text.Trim(); // e.g. "This is a BOOK."

    // The words of the sentence, in the order they must appear.
    Regex[] words =
    {
        new Regex("This"),
        new Regex("is"),
        new Regex("a"),
        new Regex("BOOK"),
    };

    string trace = string.Empty;
    int startAt = 0;
    for (int i = 0; i < words.Length; i++)
    {
        string separator = i == 0 ? string.Empty : " → ";

        // Continue searching from just behind the previous match.
        Match m = words[i].Match(str, startAt);
        if (!m.Success)
        {
            txt02ans.Text = trace + separator + "X"; // not matched: show and stop
            return;
        }

        trace += separator + string.Format("[{0}:{1},{2}]", m.Value, m.Index, m.Length);

        // Skip the single separator character after the word, but never move
        // past the end of the input: Regex.Match throws ArgumentOutOfRangeException
        // for a start position greater than the string length (e.g. input "This").
        startAt = Math.Min(m.Index + m.Length + 1, str.Length);
    }

    txt02ans.Text = trace;
}
// 密碼驗證:必須有數字、大寫英文字元、小寫英文字元、特殊字元,非空白字元8位以上
(?=.*[0-9])(?=.*[A-Z])(?=.*[a-z])(?=.*[!@#$%^&+=])\S{8,}
// 密碼驗證:必須有數字、大寫英文字元、小寫英文字元、特殊字元,只能填指定字元(英數字與特殊字元)8位以上
(?=.*\d)(?=.*[A-Z])(?=.*[a-z])(?=.*[!@#$%^&+=])[\w!@#$%^&+=]{8,}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// 使用 RegEx 解析文字報表文件檔
// 如同報表,可先以節段區分出如:RH (report header), PH (page header), DT (detail)
// 解析單位為一行。
// 函式類別只有兩種: IsMatch<REPORT_SECTION>, TryParse<REPORT_SECTION>
//////////////////////////////////////////////////////////////////////////////////////////////////////
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
namespace TEST
{
/// <summary>
/// Line-oriented regex parser for a fixed-layout text report.
/// The report is handled one line at a time and its lines fall into sections:
/// report header (RH1-RH3), page header (PH1-PH3) and detail (DT1-DT2).
/// Two kinds of members exist: static <c>IsMatch...</c> methods that classify
/// a line, and instance <c>TryParse...</c> methods that extract field values
/// from a line into the public fields of this object.
/// </summary>
public class DMR_RptParseHelper
{
    #region patterns
    // All patterns are constant, so they are built once and reused.

    // Field patterns used by TryParseDetailLine1, in the order they are applied.
    private static readonly Regex CustomerIdPattern = new Regex(@"\w+");                         // Customer ID
    private static readonly Regex ChangeTypePattern = new Regex(@"\d+");                         // Change Type
    private static readonly Regex ChangeTypeNamePattern = new Regex(@"\S+(\s\S+)*\s+FROM");      // Change Type Name (ends with the FROM keyword)
    private static readonly Regex DataFromPattern = new Regex(@"FROM=\s?\S+");                   // Data From
    private static readonly Regex VpUserIdPattern = new Regex(@"[\w.]+");                        // V+ User ID

    // Field patterns used by TryParseDetailLine2 / TryParseReportHeaderLine3.
    private static readonly Regex DataToPattern = new Regex(@"TO=\s?\S+");
    private static readonly Regex ProcDatePattern = new Regex(@"PROC DATE \d\d/\d\d/\d{4}");

    // Whole-line patterns used by the IsMatch... classifiers.
    private static readonly Regex ReportHeadLine1Pattern = new Regex(@"^((AR000000 - (R02|R05|R09|R59))\s+(ANZ - TAIWAN)\s{40,})$");
    private static readonly Regex ReportHeadLine2Pattern = new Regex(@"^((021 - ANZ)\s+(CUST NAME-ADDRESS MAINTENACE)\s+(FILE DATE )\d\d/\d\d/\d{4}\s+(PAGE)\s+\d+)$");
    private static readonly Regex ReportHeadLine3Pattern = new Regex(@"^(\s{50,}(PROC DATE )\d\d/\d\d/\d{4}\s+(TIME )\d\d\.\d\d\.\d\d\s*)$");
    private static readonly Regex PageHeadLine1Pattern = new Regex(@"^((MAINT)\s+(\*--SECURITY NAME--\*))$");
    private static readonly Regex PageHeadLine2Pattern = new Regex(@"^((ACCOUNT)\s+(CODE)\s+(S)\s+(DATE)\s+(TIME))$");
    private static readonly Regex PageHeadLine3Pattern = new Regex(@"^((NUMBER)\s+(\+ OCC)\s+(FIELD DESCRIPTION)\s.+(CHANGE DATA).+(C)\s+(STAMP)\s+(STAMP)\s+(TERM))$");
    private static readonly Regex DetailLine1Pattern = new Regex(@"^((\w+)\s+(\d+)\s+(\S+(\s\S+)*)\s+(FROM=\s?\S+)\s+([\w.]+))$");
    private static readonly Regex DetailLine2Pattern = new Regex(@"^(\s{30,}(TO=)(\s?\S+)\s+([A-Z])\s+(\d{6})\s+(\d{6})\s+(\w*)\s*)$");
    #endregion

    #region properties
    // detail fields
    public string CustomerID = string.Empty;
    public string ChangeType = string.Empty;
    public string ChangeTypeName = string.Empty;
    public string DataFrom = string.Empty;
    public string DataTo = string.Empty;
    public string VpUserID = string.Empty;
    // header fields
    public string ProcDate = string.Empty;
    #endregion

    /// <summary>
    /// Returns all parsed fields on one line, each as "Label[value]".
    /// </summary>
    public override string ToString()
    {
        return string.Format(
            "Acc[{0}], Chg[{1}], ChgN[{2}], From[{3}], To[{4}], V+ User[{5}], ProcDate[{6}]",
            this.CustomerID,
            this.ChangeType,
            this.ChangeTypeName,
            this.DataFrom,
            this.DataTo,
            this.VpUserID,
            this.ProcDate);
    }

    /// <summary>
    /// Parses the first detail line, e.g.
    /// "0000000000A122326123 9206 OWNER ADDRESS 1 FROM= ... CTW7.61",
    /// into <see cref="CustomerID"/>, <see cref="ChangeType"/>,
    /// <see cref="ChangeTypeName"/>, <see cref="DataFrom"/> and <see cref="VpUserID"/>.
    /// The fields are matched left to right, each search starting behind the
    /// previous match.
    /// </summary>
    /// <param name="lineDT1">The raw detail line.</param>
    /// <returns>
    /// true when all five fields were found. On false the five fields are reset
    /// first, but those located before the failing one keep their parsed values.
    /// A null or truncated line returns false rather than throwing.
    /// </returns>
    public bool TryParseDetailLine1(string lineDT1)
    {
        // Assume failure: clear everything this method is responsible for.
        this.CustomerID = string.Empty;
        this.ChangeType = string.Empty;
        this.ChangeTypeName = string.Empty;
        this.DataFrom = string.Empty;
        this.VpUserID = string.Empty;

        if (lineDT1 == null)
        {
            return false;
        }

        // Field 1: Customer ID (leading zeros are dropped).
        Match m1 = MatchFrom(CustomerIdPattern, lineDT1, 0);
        if (!m1.Success)
        {
            return false;
        }
        this.CustomerID = m1.Value.Trim().TrimStart('0');

        // Field 2: Change Type. "+ 1" skips the separator after the previous field.
        Match m2 = MatchFrom(ChangeTypePattern, lineDT1, m1.Index + m1.Length + 1);
        if (!m2.Success)
        {
            return false;
        }
        this.ChangeType = m2.Value.Trim();

        // Field 3: Change Type Name. The match includes the trailing "FROM"
        // keyword (4 characters), which is cut off again here.
        Match m3 = MatchFrom(ChangeTypeNamePattern, lineDT1, m2.Index + m2.Length + 1);
        if (!m3.Success)
        {
            return false;
        }
        this.ChangeTypeName = m3.Value.Remove(m3.Value.Length - 4, 4).Trim();

        // Field 4: Data From. Step back 4 characters so the search starts on the
        // "FROM" keyword that closed field 3; the "FROM=" prefix (5 characters)
        // is then stripped from the value.
        Match m4 = MatchFrom(DataFromPattern, lineDT1, m3.Index + m3.Length - 4);
        if (!m4.Success)
        {
            return false;
        }
        this.DataFrom = m4.Value.Substring(5).Trim();

        // Field 5: V+ User ID (dots are removed).
        Match m5 = MatchFrom(VpUserIdPattern, lineDT1, m4.Index + m4.Length + 1);
        if (!m5.Success)
        {
            return false;
        }
        this.VpUserID = m5.Value.Trim().Replace(".", "");

        return true;
    }

    /// <summary>
    /// Parses the second detail line and stores the value behind "TO=" in
    /// <see cref="DataTo"/>.
    /// </summary>
    /// <param name="lineDT2">The raw detail line.</param>
    /// <returns>true when a "TO=" value was found; otherwise false with <see cref="DataTo"/> cleared.</returns>
    public bool TryParseDetailLine2(string lineDT2)
    {
        Match m = lineDT2 == null ? Match.Empty : DataToPattern.Match(lineDT2);
        if (!m.Success)
        {
            this.DataTo = string.Empty;
            return false;
        }

        // Strip the "TO=" prefix (3 characters).
        this.DataTo = m.Value.Substring(3).Trim();
        return true;
    }

    /// <summary>
    /// Parses the third report-header line and stores its "PROC DATE MM/dd/yyyy"
    /// value in <see cref="ProcDate"/>, reformatted as "yyyy/MM/dd".
    /// </summary>
    /// <param name="lineRH3">The raw header line.</param>
    /// <returns>
    /// true when the date was found and is a valid calendar date; otherwise false
    /// with <see cref="ProcDate"/> cleared.
    /// </returns>
    public bool TryParseReportHeaderLine3(string lineRH3)
    {
        const string prefix = "PROC DATE ";

        Match m = lineRH3 == null ? Match.Empty : ProcDatePattern.Match(lineRH3);
        if (!m.Success)
        {
            this.ProcDate = string.Empty;
            return false;
        }

        // The report layout is fixed (month/day/year, e.g. 08/27/2016), so parse
        // and format with the invariant culture; relying on the current culture
        // would reject or reinterpret the date on machines with other settings.
        string dateText = m.Value.Substring(prefix.Length);
        DateTime dt;
        bool parsed = DateTime.TryParseExact(
            dateText,
            "MM/dd/yyyy",
            System.Globalization.CultureInfo.InvariantCulture,
            System.Globalization.DateTimeStyles.None,
            out dt);
        if (!parsed)
        {
            this.ProcDate = string.Empty;
            return false;
        }

        this.ProcDate = dt.ToString("yyyy/MM/dd", System.Globalization.CultureInfo.InvariantCulture);
        return true;
    }

    /// <summary>
    /// Tests for report header line 1: "AR000000 - Rnn" (R02/R05/R09/R59),
    /// "ANZ - TAIWAN", then at least 40 trailing whitespace characters.
    /// The line is matched untrimmed.
    /// </summary>
    public static bool IsMatchReportHeadLine1(string line)
    {
        return line != null && ReportHeadLine1Pattern.IsMatch(line);
    }

    /// <summary>
    /// Tests for report header line 2, e.g.
    /// "021 - ANZ CUST NAME-ADDRESS MAINTENACE FILE DATE 08/26/2016 PAGE 6".
    /// The line is trimmed before matching.
    /// </summary>
    public static bool IsMatchReportHeadLine2(string line)
    {
        return line != null && ReportHeadLine2Pattern.IsMatch(line.Trim());
    }

    /// <summary>
    /// Tests for report header line 3: at least 50 leading whitespace characters,
    /// then "PROC DATE MM/dd/yyyy TIME hh.mm.ss". The line is matched untrimmed.
    /// </summary>
    public static bool IsMatchReportHeadLine3(string line)
    {
        return line != null && ReportHeadLine3Pattern.IsMatch(line);
    }

    /// <summary>
    /// Tests for page header line 1 ("MAINT *--SECURITY NAME--*").
    /// The line is trimmed before matching.
    /// </summary>
    public static bool IsMatchPageHeadLine1(string line)
    {
        return line != null && PageHeadLine1Pattern.IsMatch(line.Trim());
    }

    /// <summary>
    /// Tests for page header line 2 ("ACCOUNT CODE S DATE TIME").
    /// The line is trimmed before matching.
    /// </summary>
    public static bool IsMatchPageHeadLine2(string line)
    {
        return line != null && PageHeadLine2Pattern.IsMatch(line.Trim());
    }

    /// <summary>
    /// Tests for page header line 3
    /// ("NUMBER + OCC FIELD DESCRIPTION ... CHANGE DATA ... C STAMP STAMP TERM").
    /// The line is trimmed before matching.
    /// </summary>
    public static bool IsMatchPageHeadLine3(string line)
    {
        return line != null && PageHeadLine3Pattern.IsMatch(line.Trim());
    }

    /// <summary>
    /// Tests for detail line 1: customer id, change type, change type name,
    /// "FROM=value" and user id, separated by whitespace. The line is trimmed
    /// before matching, so leading and trailing blanks are not significant.
    /// </summary>
    public static bool IsMatchDetailLine1(string line)
    {
        return line != null && DetailLine1Pattern.IsMatch(line.Trim());
    }

    /// <summary>
    /// Tests for detail line 2: at least 30 leading whitespace characters, then
    /// "TO=value", one capital letter, two 6-digit numbers and an optional word.
    /// The line is matched untrimmed.
    /// </summary>
    public static bool IsMatchDetailLine2(string line)
    {
        return line != null && DetailLine2Pattern.IsMatch(line);
    }

    // Runs the pattern from startAt, clamped to the end of the input.
    // Regex.Match throws ArgumentOutOfRangeException for a start position beyond
    // the string length, which the "+ 1" separator skip produces whenever the
    // previous field ends the line.
    private static Match MatchFrom(Regex pattern, string input, int startAt)
    {
        return pattern.Match(input, Math.Min(startAt, input.Length));
    }
}
}
/////////////////// 應用/測試紀錄 //////////////
/// <summary>
/// Click handler that exercises the detail-line helpers: reports whether the
/// two input lines are recognized as DT1 / DT2, then parses both and appends
/// the parsed fields (prefixed with "FAIL!" when either parse failed) to
/// <c>txtLineAns</c>.
/// </summary>
protected void btnLine_Click(object sender, EventArgs e)
{
    string detailLine1 = txtLine1.Text;
    string detailLine2 = txtLine2.Text;

    txtLineAns.Text = string.Empty;

    // Classification: does each line look like the expected detail line?
    txtLineAns.Text += DMR_RptParseHelper.IsMatchDetailLine1(detailLine1) ? "DT1 Y " : "DT1 N ";
    txtLineAns.Text += DMR_RptParseHelper.IsMatchDetailLine2(detailLine2) ? "DT2 Y " : "DT2 N ";

    // Extraction: both lines feed the same helper instance.
    var report = new DMR_RptParseHelper();
    bool firstParsed = report.TryParseDetailLine1(detailLine1);
    bool secondParsed = report.TryParseDetailLine2(detailLine2);

    // The fields are shown either way; a failure is only flagged in the prefix.
    string prefix = (firstParsed && secondParsed) ? "; " : "; FAIL! → ";
    txtLineAns.Text += prefix + report.ToString();
}
/// <summary>
/// Click handler that exercises the header helpers: reports for each of the six
/// input lines whether it is recognized as RH1-RH3 / PH1-PH3, then parses the
/// processing date out of the third header line and appends it to
/// <c>txtHeadAns</c> when parsing succeeds.
/// </summary>
protected void btnHead_Click(object sender, EventArgs e)
{
    string head1 = txtHeadLine1.Text;
    string head2 = txtHeadLine2.Text;
    string head3 = txtHeadLine3.Text;
    string head4 = txtHeadLine4.Text;
    string head5 = txtHeadLine5.Text;
    string head6 = txtHeadLine6.Text;

    txtHeadAns.Text = string.Empty;

    // Report header lines: show match or not.
    txtHeadAns.Text += DMR_RptParseHelper.IsMatchReportHeadLine1(head1) ? "RH1 Y " : "RH1 N ";
    txtHeadAns.Text += DMR_RptParseHelper.IsMatchReportHeadLine2(head2) ? "RH2 Y " : "RH2 N ";
    txtHeadAns.Text += DMR_RptParseHelper.IsMatchReportHeadLine3(head3) ? "RH3 Y " : "RH3 N ";

    // Page header lines.
    txtHeadAns.Text += DMR_RptParseHelper.IsMatchPageHeadLine1(head4) ? "PH1 Y " : "PH1 N ";
    txtHeadAns.Text += DMR_RptParseHelper.IsMatchPageHeadLine2(head5) ? "PH2 Y " : "PH2 N ";
    txtHeadAns.Text += DMR_RptParseHelper.IsMatchPageHeadLine3(head6) ? "PH3 Y " : "PH3 N ";

    // Parse the processing date from the third header line.
    var report = new DMR_RptParseHelper();
    if (report.TryParseReportHeaderLine3(txtHeadLine3.Text))
    {
        txtHeadAns.Text += ", PROC_DATE : " + report.ProcDate;
    }
}
/// <summary>
/// Extracts every LOI clause from <paramref name="notes_for_tsd"/> using plain
/// string searching (no regular expressions).
/// A clause starts right after an "LOI:" tag and runs up to the nearest of:
/// "CBC", "DCI", "PRO", "HYBF", "客戶已辦理授權", "被授權人", the next "LOI:" tag,
/// or the end of the text. A comma is deliberately not used as a terminator
/// because it is not precise enough.
/// </summary>
/// <param name="notes_for_tsd">The notes text to scan; null or empty yields nothing.</param>
/// <returns>The clause texts (without the "LOI:" tag), in order of appearance.</returns>
public static IEnumerable<string> QueryLoiClause(string notes_for_tsd)
{
    const string LoiTag = "LOI:";

    // Anything in this list ends the current clause. The tag itself is included
    // so that consecutive "LOI:...LOI:..." clauses are split correctly.
    string[] terminators = { "CBC", "DCI", "PRO", "HYBF", "客戶已辦理授權", "被授權人", LoiTag };

    if (string.IsNullOrEmpty(notes_for_tsd))
    {
        yield break;
    }

    int searchFrom = 0;
    while (true)
    {
        // Ordinal search: these are fixed markers, not linguistic text.
        int tagIndex = notes_for_tsd.IndexOf(LoiTag, searchFrom, System.StringComparison.Ordinal);
        if (tagIndex < 0)
        {
            // Only -1 means "not found"; index 0 is a valid hit for a text
            // that begins with the tag.
            yield break;
        }

        int clauseStart = tagIndex + LoiTag.Length;

        // The clause may also be the last thing in the text.
        int clauseEnd = notes_for_tsd.Length;
        foreach (string terminator in terminators)
        {
            int hit = notes_for_tsd.IndexOf(terminator, clauseStart, System.StringComparison.Ordinal);
            if (hit >= 0 && hit < clauseEnd)
            {
                clauseEnd = hit; // keep the nearest terminator
            }
        }

        yield return notes_for_tsd.Substring(clauseStart, clauseEnd - clauseStart);

        // Resume at the terminator so that a following "LOI:" tag is picked up.
        searchFrom = clauseEnd;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment