Skip to content

Instantly share code, notes, and snippets.

@HristoKolev
Created November 7, 2016 20:11
Show Gist options
  • Save HristoKolev/a4fe84c6c04ee82a3baefaa9a2700e18 to your computer and use it in GitHub Desktop.
Save HristoKolev/a4fe84c6c04ee82a3baefaa9a2700e18 to your computer and use it in GitHub Desktop.
namespace HtmlScrubber
{
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using Fizzler.Systems.HtmlAgilityPack;
using HtmlAgilityPack;
/// <summary>
/// Pairs a heading's display text with the element id generated for it.
/// </summary>
public class LinkObject
{
    /// <summary>The heading text as it appears in the document.</summary>
    public string Title { get; set; }

    /// <summary>The id assigned to the heading element.</summary>
    public string HeaderId { get; set; }
}
// Required NuGet packages:
//   Install-Package Fizzler.Systems.HtmlAgilityPack
//   Install-Package HtmlAgilityPack -Version 1.4.9
/// <summary>
/// Console tool that reads an HTML file, assigns every <c>h1</c> element an <c>id</c>
/// derived from its text, prints each title/id pair to the console and writes the
/// modified markup to a target file.
/// </summary>
internal class Program
{
    // Locations used when no paths are passed on the command line.
    private const string DefaultSourcePath = @"C:\Users\hristo.kolev\Desktop\source.txt";
    private const string DefaultTargetPath = @"C:\Users\hristo.kolev\Desktop\target.txt";

    // Matches every run of characters that is not an ASCII letter, digit, underscore or hyphen.
    // The letter ranges are spelled out on purpose: a single "A-z" range also spans the
    // punctuation characters [ \ ] ^ ` that sit between 'Z' and 'a' in ASCII.
    private static readonly Regex EverythingExceptWordCharactersRegex = new Regex("[^A-Za-z0-9_-]+");

    // Matches one or more consecutive whitespace characters.
    private static readonly Regex WhiteSpaceRegex = new Regex(@"\s+");

    /// <summary>
    /// Encodes <paramref name="plainText"/> as UTF-8 and returns the Base64 form of those bytes.
    /// </summary>
    /// <param name="plainText">The text to encode.</param>
    /// <returns>The Base64 representation of the UTF-8 bytes of <paramref name="plainText"/>.</returns>
    public static string Base64Encode(string plainText)
    {
        var plainTextBytes = Encoding.UTF8.GetBytes(plainText);
        return Convert.ToBase64String(plainTextBytes);
    }

    /// <summary>
    /// Turns heading text into an id: each whitespace run becomes a single hyphen, every
    /// character other than ASCII letters, digits, underscores and hyphens is removed,
    /// and the result is lower-cased.
    /// </summary>
    /// <param name="header">The heading text to convert.</param>
    /// <returns>The generated id; may be empty if no allowed characters remain.</returns>
    private static string ConvertToValidId(string header)
    {
        string value = WhiteSpaceRegex.Replace(header, "-");
        value = EverythingExceptWordCharactersRegex.Replace(value, string.Empty);

        // Invariant casing keeps ids identical regardless of the machine's culture
        // (e.g. a Turkish culture would lower 'I' to a dotless 'ı').
        return value.ToLowerInvariant();
    }

    /// <summary>
    /// Entry point. Optional arguments: <c>args[0]</c> is the source file path and
    /// <c>args[1]</c> is the target file path; the built-in defaults are used for any
    /// argument that is not supplied.
    /// </summary>
    /// <param name="args">Command-line arguments.</param>
    private static void Main(string[] args)
    {
        string sourcePath = args != null && args.Length > 0 ? args[0] : DefaultSourcePath;
        string targetPath = args != null && args.Length > 1 ? args[1] : DefaultTargetPath;

        string content = File.ReadAllText(sourcePath);
        var links = new List<LinkObject>();

        var document = new HtmlDocument();
        // NOTE(review): LoadHtml2 is kept exactly as written; confirm it is available in the
        // pinned HtmlAgilityPack version before upgrading the package.
        document.LoadHtml2(content);
        var root = document.DocumentNode;

        // Tag each h1 with its generated id and remember the title/id pair.
        foreach (var header in root.QuerySelectorAll("h1"))
        {
            var link = new LinkObject
            {
                Title = header.InnerText,
                HeaderId = ConvertToValidId(header.InnerText)
            };
            links.Add(link);
            header.SetAttributeValue("id", link.HeaderId);
        }

        foreach (var link in links)
        {
            Console.WriteLine($"{link.Title}, {link.HeaderId}");
        }

        File.WriteAllText(targetPath, root.InnerHtml);
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment