Skip to content

Instantly share code, notes, and snippets.

Created October 16, 2012 23:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/3902673 to your computer and use it in GitHub Desktop.
Save anonymous/3902673 to your computer and use it in GitHub Desktop.
metro c# webscraper with HtmlAgilityPack
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using HtmlAgilityPack;
using Windows.UI;
using Windows.UI.Xaml;
using Windows.UI.Xaml.Controls;
using Windows.UI.Xaml.Media;
namespace ReaderModeTest1.Common
{
/// <summary>
/// Attached properties that give a <see cref="WebView"/> a simple "reader mode":
/// setting <c>Url</c> downloads the page, picks the <c>div</c> that looks most like
/// the article body, shows it in the WebView with theme colors applied, and
/// publishes the detected headline through the <c>Header</c> attached property.
/// </summary>
class WebViewReaderMode
{
    // A div is only considered an article candidate when its text measure
    // (see WCforNode) exceeds this value ...
    private const int MinTextMeasure = 250;

    // ... and it contains more than this many <p> elements.
    private const int MinParagraphCount = 3;

    // Substitute used for the link measure when a candidate contains no link text.
    private const int LinkMeasureFloor = 5;

    // Headings whose text is not longer than this are ignored as titles.
    private const int MinHeadingLength = 15;

    // How many ancestors of the article node are searched for a heading.
    private const int MaxAncestorLevels = 3;

    // Heading elements tried, in priority order, when looking for a title.
    private static readonly string[] HeadingTags = { "h1", "h2", "h3" };

    // Matches any XML declaration regardless of the encoding it names.
    private static readonly Regex XmlDeclaration = new Regex(@"<\?xml[^>]*\?>");

    /// <summary>Identifies the <c>Url</c> attached property; a change starts a new scrape.</summary>
    public static readonly DependencyProperty UrlProperty =
        DependencyProperty.RegisterAttached("Url", typeof(string), typeof(WebViewReaderMode), new PropertyMetadata("", OnUrlChanged));

    /// <summary>Identifies the <c>Header</c> attached property, which receives the detected headline.</summary>
    public static readonly DependencyProperty HeaderProperty =
        DependencyProperty.RegisterAttached("Header", typeof(string), typeof(WebViewReaderMode), new PropertyMetadata("", OnHeaderChanged));

    /// <summary>Gets the value of the <c>Url</c> attached property from <paramref name="obj"/>.</summary>
    public static string GetUrl(DependencyObject obj)
    {
        return (string)obj.GetValue(UrlProperty);
    }

    /// <summary>Sets the value of the <c>Url</c> attached property on <paramref name="obj"/>.</summary>
    public static void SetUrl(DependencyObject obj, string value)
    {
        obj.SetValue(UrlProperty, value);
    }

    /// <summary>Gets the value of the <c>Header</c> attached property from <paramref name="obj"/>.</summary>
    public static string GetHeader(DependencyObject obj)
    {
        return (string)obj.GetValue(HeaderProperty);
    }

    /// <summary>Sets the value of the <c>Header</c> attached property on <paramref name="obj"/>.</summary>
    public static void SetHeader(DependencyObject obj, string value)
    {
        obj.SetValue(HeaderProperty, value);
    }

    /// <summary>
    /// Property-changed callback for <c>Url</c>. Starts scraping the new URL for the
    /// WebView the property is attached to. Does nothing when the property is attached
    /// to something other than a WebView or when the new value is null/blank.
    /// </summary>
    private static async void OnUrlChanged(DependencyObject sender, DependencyPropertyChangedEventArgs eventArgs)
    {
        var webView = sender as WebView;
        var url = eventArgs.NewValue as string;
        if (webView == null || string.IsNullOrWhiteSpace(url))
        {
            return;
        }

        try
        {
            await scrapeWebContentAsync(webView, url);
        }
        catch (Exception ex)
        {
            // This is the top-level boundary of a fire-and-forget operation: an
            // exception escaping an async void method cannot be caught by any caller,
            // so failures are logged here instead of being rethrown.
            Debug.WriteLine("WebViewReaderMode: could not load '" + url + "': " + ex);
        }
    }

    /// <summary>Property-changed callback for <c>Header</c>; intentionally a no-op.</summary>
    private static void OnHeaderChanged(DependencyObject sender, DependencyPropertyChangedEventArgs eventArgs)
    {
    }

    /// <summary>
    /// Downloads <paramref name="url"/>, extracts the most article-like <c>div</c> and
    /// displays it in <paramref name="target"/>, also setting the <c>Header</c> attached
    /// property on it. If no candidate is found the WebView is left untouched.
    /// </summary>
    /// <param name="target">The WebView that requested the page and receives the result.</param>
    /// <param name="url">Address of the page to load.</param>
    private static async Task scrapeWebContentAsync(WebView target, string url)
    {
        var htmlWeb = new HtmlWeb();
        var doc = await htmlWeb.LoadFromWebAsync(url);

        // The Url may have been changed again while the download was running;
        // showing this (now stale) page would overwrite the newer request.
        if (!string.Equals(GetUrl(target), url, StringComparison.Ordinal))
        {
            return;
        }

        if (doc == null || doc.DocumentNode == null)
        {
            return;
        }

        doc.OptionOutputAsXml = true;
        doc.OptionFixNestedTags = true;
        doc.OptionAutoCloseOnEnd = true;

        doc.DocumentNode.InnerHtml = normalizeMarkup(doc.DocumentNode.InnerHtml);

        // Snapshot the tree first: nodes are removed while iterating.
        foreach (var n in walkDom(doc.DocumentNode).ToList())
        {
            if (n.NodeType == HtmlNodeType.Comment || n.Name == "script")
            {
                n.Remove();
            }
        }

        var articleNode = findArticleNode(doc.DocumentNode);
        if (articleNode == null)
        {
            return;
        }

        // Header extraction must run before serialization because it removes the
        // heading it picks from inside the article.
        SetHeader(target, findHeaderText(doc.DocumentNode, articleNode));
        target.NavigateToString(buildStyleSheet() + serializeArticle(articleNode));
    }

    /// <summary>
    /// Collapses runs of newlines, tabs and carriage returns into a single space and
    /// removes backslashes. A space (not an empty string) is used so that words and
    /// attributes separated only by a line break, e.g. <c>&lt;a\nhref=...&gt;</c>,
    /// are not fused together.
    /// </summary>
    private static string normalizeMarkup(string html)
    {
        var singleLine = Regex.Replace(html, @"[\n\t\r]+", " ");
        return singleLine.Replace("\\", "");
    }

    /// <summary>
    /// Returns the <c>div</c> under <paramref name="root"/> with the lowest ratio of
    /// link text to total text among those exceeding the minimum text measure and
    /// paragraph count, or <c>null</c> when no div qualifies.
    /// </summary>
    private static HtmlNode findArticleNode(HtmlNode root)
    {
        HtmlNode best = null;
        var bestRatio = double.MaxValue;

        foreach (var candidate in getElementsofType(root, "div"))
        {
            var len = WCforNode(candidate);
            if (len <= MinTextMeasure || getElementsofType(candidate, "p").Count <= MinParagraphCount)
            {
                continue;
            }

            var linkMeasure = getLinkWC(candidate);
            if (linkMeasure == 0)
            {
                linkMeasure = LinkMeasureFloor;
            }

            var ratio = (double)linkMeasure / len;
            if (ratio < bestRatio)
            {
                bestRatio = ratio;
                best = candidate;
            }
        }

        return best;
    }

    /// <summary>
    /// Looks for a headline, in order: inside the article (removing the heading that is
    /// used), in the article's previous sibling, in up to three ancestors, and finally
    /// in the document's <c>title</c> element. Returns an empty string if none is found.
    /// </summary>
    private static string findHeaderText(HtmlNode documentNode, HtmlNode articleNode)
    {
        var headerText = getTitlefromElement(articleNode, true);
        if (headerText == "")
        {
            headerText = getTitlefromElement(articleNode.PreviousSibling);
        }

        // Walk upwards, stopping early when the tree runs out of ancestors.
        var ancestor = articleNode.ParentNode;
        for (var level = 0; headerText == "" && ancestor != null && level < MaxAncestorLevels; level++)
        {
            headerText = getTitlefromElement(ancestor);
            ancestor = ancestor.ParentNode;
        }

        if (headerText == "")
        {
            var titleNode = getElementsofType(documentNode, "title").FirstOrDefault();
            if (titleNode != null)
            {
                headerText = titleNode.InnerText;
            }
        }

        return headerText;
    }

    /// <summary>
    /// Re-parses the article's markup in a fresh document, saves it as XML and returns
    /// the result with any XML declaration stripped.
    /// </summary>
    private static string serializeArticle(HtmlNode articleNode)
    {
        var cleandoc = new HtmlDocument();
        cleandoc.OptionOutputAsXml = true;
        cleandoc.OptionFixNestedTags = true;
        cleandoc.OptionAutoCloseOnEnd = true;
        cleandoc.LoadHtml(articleNode.OuterHtml);

        using (var writer = new StringWriter())
        {
            cleandoc.Save(writer);
            return XmlDeclaration.Replace(writer.ToString(), "");
        }
    }

    /// <summary>
    /// Builds the <c>&lt;style&gt;</c> element that applies the application's theme
    /// foreground/background colors and content font size to every element.
    /// </summary>
    private static string buildStyleSheet()
    {
        var resources = Application.Current.Resources;
        var foregroundColor = color2Rbga(((SolidColorBrush)resources["ApplicationForegroundThemeBrush"]).Color);
        var backgroundColor = color2Rbga(((SolidColorBrush)resources["ApplicationPageBackgroundThemeBrush"]).Color);
        var fontSize = resources["ControlContentThemeFontSize"] as double?;

        // Invariant culture keeps the decimal separator a '.', as CSS requires; the
        // declaration is omitted entirely when the resource is not a double.
        var fontSizeRule = fontSize.HasValue
            ? "font-size:" + fontSize.Value.ToString(CultureInfo.InvariantCulture) + "pt;"
            : "";

        return "<style>*{font-family: Calibri, sans-serif;" + fontSizeRule
            + "background-color:" + backgroundColor + ";color:" + foregroundColor + ";}</style>";
    }

    /// <summary>
    /// Formats <paramref name="col"/> as a CSS <c>rgba(r,g,b,a)</c> value, padded with a
    /// space on each side. The alpha byte is scaled to the 0..1 range CSS expects.
    /// </summary>
    private static string color2Rbga(Color col)
    {
        var alpha = (col.A / 255.0).ToString("0.###", CultureInfo.InvariantCulture);
        return " rgba(" + col.R + "," + col.G + "," + col.B + "," + alpha + ") ";
    }

    /// <summary>
    /// Enumerates <paramref name="node"/> followed by all of its descendants in
    /// document order. Yields nothing when <paramref name="node"/> is <c>null</c>.
    /// </summary>
    private static IEnumerable<HtmlNode> walkDom(HtmlNode node)
    {
        if (node == null)
        {
            yield break;
        }

        yield return node;
        foreach (var child in node.ChildNodes)
        {
            foreach (var descendant in walkDom(child))
            {
                yield return descendant;
            }
        }
    }

    /// <summary>
    /// Returns the text of the first <c>h1</c>, <c>h2</c> or <c>h3</c> (tried in that
    /// order) under <paramref name="node"/> whose text is longer than 15 characters,
    /// or an empty string when there is none or <paramref name="node"/> is <c>null</c>.
    /// Only the first element of each heading level is examined.
    /// </summary>
    /// <param name="node">Subtree to search.</param>
    /// <param name="removeEle">When true, the heading that supplied the text is removed from the tree.</param>
    private static string getTitlefromElement(HtmlNode node, bool removeEle = false)
    {
        foreach (var tag in HeadingTags)
        {
            var heading = getElementsofType(node, tag).FirstOrDefault();
            if (heading == null || heading.InnerText.Length <= MinHeadingLength)
            {
                continue;
            }

            var text = heading.InnerText;
            if (removeEle)
            {
                heading.Remove();
            }

            return text;
        }

        return "";
    }

    /// <summary>
    /// Returns a size measure for the text under <paramref name="node"/>: the total
    /// number of pieces obtained by splitting every text node on <c>\w</c>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the pattern splits on each word <em>character</em>, so the result
    /// grows with the number of word characters rather than the number of words. It is
    /// kept as-is because the thresholds in <c>findArticleNode</c> are compared against
    /// this measure; switching to a true word count would need those values re-tuned.
    /// </remarks>
    private static int WCforNode(HtmlNode node)
    {
        return getElementsofType(node, "#text")
            .SelectMany(segment => Regex.Split(segment.InnerText, "\\w"))
            .Count();
    }

    /// <summary>
    /// Returns the average of the <see cref="WCforNode"/>-style measure over all
    /// <paramref name="elementType"/> elements under <paramref name="node"/>, or 0
    /// when there are no such elements.
    /// </summary>
    private static double AvgWCforNode(HtmlNode node, string elementType)
    {
        var elements = getElementsofType(node, elementType);
        if (elements.Count == 0)
        {
            // Enumerable.Average throws on an empty sequence.
            return 0;
        }

        return elements.Average(segment => Regex.Split(segment.InnerText, "\\w").Length);
    }

    /// <summary>
    /// Returns the sum of <see cref="WCforNode"/> over every <c>a</c> element under
    /// <paramref name="node"/>.
    /// </summary>
    private static int getLinkWC(HtmlNode node)
    {
        return getElementsofType(node, "a").Sum(link => WCforNode(link));
    }

    /// <summary>
    /// Returns <paramref name="node"/> and/or its descendants whose <c>Name</c> equals
    /// <paramref name="elementType"/>, in document order. Returns an empty list when
    /// <paramref name="node"/> is <c>null</c>.
    /// </summary>
    private static List<HtmlNode> getElementsofType(HtmlNode node, string elementType)
    {
        return walkDom(node).Where(x => x.Name == elementType).ToList();
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment