Skip to content

Instantly share code, notes, and snippets.

@kovachwt
Created May 2, 2020 08:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kovachwt/dd6b39f0af8abcc9451415dd6a8cecc2 to your computer and use it in GitHub Desktop.
Save kovachwt/dd6b39f0af8abcc9451415dd6a8cecc2 to your computer and use it in GitHub Desktop.
Remove HTML tags from a Drupal node text dump
using System;
using System.IO;
using System.Collections.Generic;
using System.Text.RegularExpressions;
namespace txtsredvach
{
class Program
{
// Path of the text file that receives the converted chunks.
// Main assigns it (input path + ".processed.txt") before any chunk is
// processed; processChunk appends to it.
static string outputFile;
// Entry point: reads the input file named by args[0] line by line, groups
// the lines into chunks and hands each chunk to processChunk, which appends
// the plain-text version to "<input>.processed.txt".
//
// A chunk ends when
//   * a line (trimmed, case-insensitive) equals one of the separator markers
//     "body" or the literal text \n<!--break--> -- the marker line itself is
//     dropped, or
//   * the chunk already holds more than 20 lines and the current line ends
//     with "</p>" -- that line still belongs to the chunk and is kept.
// Whatever is left after the last separator is flushed as a final chunk.
//
// args: args[0] is the input file, relative to the current directory or
//       absolute. With a missing argument or a missing file a message is
//       written to stderr and the exit code is set to 1.
static void Main(string[] args)
{
    if (args == null || args.Length == 0 || string.IsNullOrEmpty(args[0]))
    {
        Console.Error.WriteLine("Usage: " + AppDomain.CurrentDomain.FriendlyName + " <input-file>");
        Environment.ExitCode = 1;
        return;
    }

    string file = Path.Combine(Directory.GetCurrentDirectory(), args[0]);
    if (!File.Exists(file))
    {
        Console.Error.WriteLine("Input file not found: " + file);
        Environment.ExitCode = 1;
        return;
    }

    outputFile = file + ".processed.txt";
    Console.WriteLine("Processing file " + file);

    int chunksdone = 0;
    using (FileStream fs = File.Open(file, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
    using (BufferedStream bs = new BufferedStream(fs))
    using (StreamReader sr = new StreamReader(bs))
    {
        // Case-insensitive set, so no lower-cased copy of every line is needed.
        HashSet<string> chunkseparators = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
        {
            "body",
            @"\n<!--break-->"
        };

        // Fully qualified because the file has no "using System.Text;".
        System.Text.StringBuilder chunk = new System.Text.StringBuilder();
        int numlines = 0;
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            string trimmed = line.Trim();
            bool isSeparator = chunkseparators.Contains(trimmed);
            bool endsLongChunk = !isSeparator
                && numlines > 20
                && trimmed.EndsWith("</p>", StringComparison.OrdinalIgnoreCase);

            if (!isSeparator && !endsLongChunk)
            {
                chunk.AppendLine(line);
                numlines++;
                continue;
            }

            // A paragraph-ending line carries text, unlike a separator
            // marker, so it must go into the chunk it terminates.
            if (endsLongChunk)
                chunk.AppendLine(line);

            processChunk(chunk.ToString());
            chunk.Length = 0;
            numlines = 0;
            chunksdone++;
            if (chunksdone % 100 == 0)
                Console.WriteLine("Processed " + chunksdone + " chunks so far.");
        }

        // Flush the text that follows the last separator; without this the
        // tail of the file would never reach the output.
        if (chunk.Length > 0)
        {
            processChunk(chunk.ToString());
            chunksdone++;
        }
    }

    Console.WriteLine("Processed " + chunksdone + " chunks!");
    // Keep the console window open until the user presses Enter.
    Console.ReadLine();
}
// Converts one raw chunk to plain text and appends it to outputFile,
// followed by an empty line.
// The verbatim literals below are the two-character sequences backslash+n
// and backslash+t as they appear in the text, not real control characters.
static void processChunk(string chunk)
{
    string flattened = chunk.Replace(@"\n", " ").Replace(@"\t", " ");
    string plainText = Html2Text(flattened, true).Trim();
    string record = string.Concat(plainText, Environment.NewLine, Environment.NewLine);
    File.AppendAllText(outputFile, record);
}
// Options shared by every pattern below. Tag and entity names are matched
// case-insensitively. The patterns are built once and reused for every call:
// the static Regex.Replace overloads only cache a small number of patterns,
// so passing this many patterns per call would rebuild them every time.
private const RegexOptions HtmlRegexOptions =
    RegexOptions.IgnoreCase | RegexOptions.CultureInvariant | RegexOptions.Compiled;

// Runs of spaces; browsers render them as a single space.
private static readonly Regex spaceRuns = new Regex(" {2,}", HtmlRegexOptions);

// Tag rewrites, applied in order. "\b" after the tag name keeps e.g. <header>
// from being treated as <head> and <pre> from being treated as <p>.
private static readonly KeyValuePair<Regex, string>[] tagRules =
{
    // Drop <head>, <script> and <style> elements together with their content.
    Rule(@"<\s*head\b[^>]*>.*?<\s*/\s*head\s*>", string.Empty),
    Rule(@"<\s*script\b[^>]*>.*?<\s*/\s*script\s*>", string.Empty),
    Rule(@"<\s*style\b[^>]*>.*?<\s*/\s*style\s*>", string.Empty),
    // Table cells become tabs.
    Rule(@"<\s*td\b[^>]*>", "\t"),
    // <br> (including <br/> and <br />), <div>, <tr> and <li> start a new line.
    Rule(@"<\s*br\b[^>]*>", "\r"),
    Rule(@"<\s*div\b[^>]*>", "\r"),
    Rule(@"<\s*tr\b[^>]*>", "\r"),
    Rule(@"<\s*li\b[^>]*>", "\r"),
    // <p> starts a new paragraph (two line breaks).
    Rule(@"<\s*p\b[^>]*>", "\r\r"),
    // Everything else enclosed in < > (links, images, comments, closing tags).
    Rule(@"<[^>]*>", string.Empty),
};

// A syntactically valid character reference: decimal, hexadecimal or named.
// Deliberately strict so that plain text such as "Tom & Jerry; ..." is not
// mistaken for an entity.
private static readonly Regex entityPattern =
    new Regex(@"&(#[0-9]{1,7}|#x[0-9a-f]{1,6}|[a-z][a-z0-9]{1,7});", HtmlRegexOptions);

// Named entities that are converted to text; every other entity is removed.
private static readonly Dictionary<string, string> entityReplacements =
    new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase)
    {
        { "nbsp", " " },
        { "bull", " * " },
        { "lsaquo", "<" },
        { "rsaquo", ">" },
        { "trade", "(tm)" },
        { "frasl", "/" },
        { "lt", "<" },
        { "gt", ">" },
        { "copy", "(c)" },
        { "reg", "(r)" },
        { "amp", "&" },
        { "quot", "\"" },
        { "apos", "'" },
    };

// Clean-up of the generated line breaks ("\r") and tabs, applied in order.
private static readonly KeyValuePair<Regex, string>[] whitespaceRules =
{
    // Spaces sitting between two break/tab characters.
    Rule("(?<=[\r\t]) +(?=[\r\t])", string.Empty),
    // Tabs sitting between two line breaks.
    Rule("(?<=\r)\t+(?=\r)", string.Empty),
    // Several tabs right after a line break become one.
    Rule("(?<=\r)\t+", "\t"),
    // At most two consecutive line breaks and four consecutive tabs.
    Rule("\r{3,}", "\r\r"),
    Rule("\t{5,}", "\t\t\t\t"),
};

// Pairs a pre-built pattern with its replacement text.
private static KeyValuePair<Regex, string> Rule(string pattern, string replacement)
{
    return new KeyValuePair<Regex, string>(new Regex(pattern, HtmlRegexOptions), replacement);
}

// Maps one matched entity to its text, or to nothing when it is not listed.
private static string ReplaceEntity(Match match)
{
    string replacement;
    return entityReplacements.TryGetValue(match.Groups[1].Value, out replacement)
        ? replacement
        : string.Empty;
}

// Converts an HTML fragment to plain text.
//
// source:          the HTML text; null yields "".
// convertEntities: when true, the entities listed in entityReplacements are
//                  converted to text and all other entities are removed;
//                  when false, entities are left untouched.
// returns:         the text with tags removed. Line breaks are written as
//                  "\r" and table cells as tabs; at most two consecutive
//                  line breaks and four consecutive tabs are kept. If a
//                  regex match times out, the unmodified source is returned.
static string Html2Text(string source, bool convertEntities)
{
    if (source == null)
        return "";
    try
    {
        // Source formatting is not significant in HTML: line breaks act as
        // spaces, tabs are dropped and runs of spaces collapse to one.
        string result = source.Replace("\r", " ").Replace("\n", " ").Replace("\t", string.Empty);
        result = spaceRuns.Replace(result, " ");

        foreach (KeyValuePair<Regex, string> rule in tagRules)
            result = rule.Key.Replace(result, rule.Value);

        // Single pass, so text produced by one entity (e.g. the "&" from
        // "&amp;") is never re-read as the start of another entity.
        if (convertEntities)
            result = entityPattern.Replace(result, ReplaceEntity);

        foreach (KeyValuePair<Regex, string> rule in whitespaceRules)
            result = rule.Key.Replace(result, rule.Value);

        return result;
    }
    catch (RegexMatchTimeoutException)
    {
        // Best effort: hand back the input rather than fail the whole run.
        return source;
    }
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment