Skip to content

Instantly share code, notes, and snippets.

@seong-min-s
Created March 4, 2021 00:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save seong-min-s/8b375948cc5c00616d5fcf71f52ec09a to your computer and use it in GitHub Desktop.
Save seong-min-s/8b375948cc5c00616d5fcf71f52ec09a to your computer and use it in GitHub Desktop.
전처리 코드
using System;
using System.IO;
using System.Data;
using HtmlAgilityPack;
namespace Preprocessing
{
class Program
{
/// <summary>
/// Reads the RTF file at <paramref name="path"/> and converts its text to HTML
/// using the SautinSoft RtfToHtml converter.
/// </summary>
/// <param name="path">Path of the RTF file to read.</param>
/// <returns>The HTML string produced by the converter.</returns>
static string RtfToHtml(string path)
{
    var converter = new SautinSoft.RtfToHtml();
    string rtfSource = File.ReadAllText(path);

    // NOTE(review): presumably makes the converter embed images in the HTML
    // output instead of writing separate files - confirm against SautinSoft docs.
    converter.ImageStyle.IncludeImageInHtml = true;

    return converter.ConvertString(rtfSource);
}
/// <summary>
/// Parses <paramref name="html"/>, loads its first &lt;table&gt; element into a
/// <see cref="DataTable"/> and writes that table as CSV to <paramref name="path"/>
/// through ExportToCSV.
/// </summary>
/// <remarks>
/// The first row that has &lt;td&gt; cells supplies the column names; every later row
/// with &lt;td&gt; cells becomes a data row. Rows without any &lt;td&gt; cell are skipped.
/// The number of tables found and every stored cell value are echoed to the console.
/// </remarks>
/// <param name="html">HTML markup that must contain at least one table.</param>
/// <param name="path">Destination CSV file path, passed through to ExportToCSV.</param>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="html"/> contains no &lt;table&gt; element.
/// </exception>
static void HtmlToCsv(string html, string path)
{
    HtmlDocument htmlDoc = new HtmlDocument();
    htmlDoc.LoadHtml(html);

    // HtmlAgilityPack's SelectNodes yields null (not an empty collection) when
    // nothing matches, so every query result is checked before it is used.
    HtmlNodeCollection tableNodes = htmlDoc.DocumentNode.SelectNodes("//table");
    if (tableNodes == null || tableNodes.Count == 0)
    {
        throw new ArgumentException("The HTML does not contain a <table> element.", nameof(html));
    }
    Console.WriteLine(tableNodes.Count);

    // Only the first table in document order is converted.
    HtmlNode tableNode = tableNodes[0];

    // Prefer rows that are direct children of the table; fall back to rows
    // wrapped in thead/tbody/tfoot when there are none.
    HtmlNodeCollection trNodes = tableNode.SelectNodes("tr")
        ?? tableNode.SelectNodes("thead/tr|tbody/tr|tfoot/tr");
    int rowCount = trNodes == null ? 0 : trNodes.Count;

    DataTable dataTable = new DataTable();
    bool headerRead = false;
    for (int i = 0; i < rowCount; i++)
    {
        HtmlNodeCollection tdNodes = trNodes[i].SelectNodes("td");
        if (tdNodes == null || tdNodes.Count == 0)
        {
            // Nothing to store for this row (for example a row made only of <th> cells).
            continue;
        }

        if (!headerRead)
        {
            for (int j = 0; j < tdNodes.Count; j++)
            {
                string columnName = tdNodes[j].InnerText;
                columnName = columnName.Replace('\n', ' ');
                columnName = columnName.Replace('\r', ' ');
                columnName = columnName.Replace("&mu;", "m");
                // Repeated header texts would otherwise make Columns.Add throw.
                columnName = MakeUniqueColumnName(dataTable, columnName);
                dataTable.Columns.Add(new DataColumn(columnName, typeof(string)));
            }
            headerRead = true;
            continue;
        }

        // A data row may be wider than the header; grow the table with
        // default-named columns so the extra cells are kept instead of
        // indexing past the last column.
        while (dataTable.Columns.Count < tdNodes.Count)
        {
            dataTable.Columns.Add();
        }

        DataRow row = dataTable.NewRow();
        for (int j = 0; j < tdNodes.Count; j++)
        {
            row[j] = tdNodes[j].InnerText;
        }
        dataTable.Rows.Add(row);
    }

    // Echo the parsed values, one per line, with a separator after each row.
    for (int i = 0; i < dataTable.Rows.Count; i++)
    {
        for (int j = 0; j < dataTable.Columns.Count; j++)
        {
            Console.WriteLine(dataTable.Rows[i][j].ToString());
        }
        Console.WriteLine("=====");
    }

    ExportToCSV(dataTable, path);
}

// Returns baseName unchanged when the table has no column of that name,
// otherwise baseName with the first free "_2", "_3", ... suffix appended.
private static string MakeUniqueColumnName(DataTable table, string baseName)
{
    if (baseName.Length == 0)
    {
        // An empty name is left alone so the DataTable assigns its own default name.
        return baseName;
    }

    string candidate = baseName;
    int suffix = 2;
    while (table.Columns.Contains(candidate))
    {
        candidate = baseName + "_" + suffix;
        suffix++;
    }
    return candidate;
}
/// <summary>
/// Writes <paramref name="dtDataTable"/> to <paramref name="strFilePath"/> as CSV:
/// one header line with the column names, then one line per row. An existing
/// file is overwritten.
/// </summary>
/// <remarks>
/// Every field is trimmed. A field containing a comma, a double quote, or a line
/// break is wrapped in double quotes with embedded quotes doubled (RFC 4180).
/// DBNull cells are written as empty fields.
/// NOTE(review): the file is written with System.Text.Encoding.Default, whose
/// actual encoding depends on the runtime/platform - confirm this is intended.
/// </remarks>
/// <param name="dtDataTable">Table to export.</param>
/// <param name="strFilePath">Path of the CSV file to create or overwrite.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="dtDataTable"/> is null.
/// </exception>
static void ExportToCSV(DataTable dtDataTable, string strFilePath)
{
    if (dtDataTable == null)
    {
        // Checked before the writer is created so a bad call does not truncate the target file.
        throw new ArgumentNullException(nameof(dtDataTable));
    }

    int columnCount = dtDataTable.Columns.Count;

    // The using block closes the file even when a write throws.
    using (StreamWriter sw = new StreamWriter(strFilePath, false, System.Text.Encoding.Default))
    {
        // headers
        for (int i = 0; i < columnCount; i++)
        {
            if (i > 0)
            {
                sw.Write(",");
            }
            sw.Write(EscapeCsvField(dtDataTable.Columns[i].ToString().Trim()));
        }
        sw.Write(sw.NewLine);

        foreach (DataRow dr in dtDataTable.Rows)
        {
            for (int i = 0; i < columnCount; i++)
            {
                if (i > 0)
                {
                    sw.Write(",");
                }
                if (!Convert.IsDBNull(dr[i]))
                {
                    sw.Write(EscapeCsvField(dr[i].ToString().Trim()));
                }
            }
            sw.Write(sw.NewLine);
        }
    }
}

// Characters whose presence requires a CSV field to be quoted.
private static readonly char[] CsvSpecialChars = { ',', '"', '\r', '\n' };

// Returns the field as-is when it needs no quoting, otherwise wraps it in
// double quotes and doubles any embedded double quote.
private static string EscapeCsvField(string field)
{
    if (field.IndexOfAny(CsvSpecialChars) < 0)
    {
        return field;
    }
    return "\"" + field.Replace("\"", "\"\"") + "\"";
}
/// <summary>
/// Entry point: converts an RTF file to HTML and exports the first table of
/// that HTML as a CSV file.
/// </summary>
/// <param name="args">
/// args[0] is the RTF input path (for example @"c:\filename.rtf");
/// args[1] is an optional CSV output path. When it is omitted, the input path
/// with its extension changed to ".csv" is used.
/// </param>
static void Main(string[] args)
{
    if (args == null || args.Length < 1 || args.Length > 2)
    {
        Console.Error.WriteLine("Usage: Preprocessing <input.rtf> [output.csv]");
        Environment.ExitCode = 1;
        return;
    }

    string rtfPath = args[0];
    string csvPath = args.Length == 2 ? args[1] : Path.ChangeExtension(rtfPath, ".csv");

    string html = Program.RtfToHtml(rtfPath);
    HtmlToCsv(html, csvPath);
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment