Created
October 16, 2012 23:13
-
-
Save anonymous/3902673 to your computer and use it in GitHub Desktop.
Metro C# web scraper built with HtmlAgilityPack
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using HtmlAgilityPack; | |
using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
using System.Text.RegularExpressions; | |
using Windows.UI; | |
using Windows.UI.Xaml; | |
using Windows.UI.Xaml.Controls; | |
using Windows.UI.Xaml.Media; | |
namespace ReaderModeTest1.Common | |
{ | |
/// <summary>
/// Attached properties that give a <see cref="WebView"/> a simple "reader mode".
/// Setting <c>Url</c> downloads the page, picks the div that looks most like the
/// article body (many words, several paragraphs, few link words), publishes the
/// article heading through <c>Header</c> and shows the themed article markup in
/// the WebView.
/// </summary>
class WebViewReaderMode
{
    // A div is only an article candidate when it holds more than this many words...
    private const int MinArticleWords = 250;

    // ...and more than this many p elements.
    private const int MinParagraphs = 3;

    // Link word count assumed for a candidate without any links. A real zero would give
    // every link-free div the same ratio (0), so the first one found would always win;
    // a small non-zero value lets longer link-free candidates score better.
    private const int LinkFreeWordCount = 5;

    // Headings whose text is not longer than this are ignored.
    private const int MinHeadingLength = 15;

    // How many ancestors of the article are searched for a heading.
    private const int MaxHeadingAncestors = 3;

    // Heading elements tried, in order of preference.
    private static readonly string[] HeadingTags = { "h1", "h2", "h3" };

    // One match per word (run of word characters).
    private static readonly Regex WordPattern = new Regex(@"\w+");

    /// <summary>Identifies the <c>Url</c> attached property; changing it triggers a scrape.</summary>
    public static readonly DependencyProperty UrlProperty =
        DependencyProperty.RegisterAttached("Url", typeof(string), typeof(WebViewReaderMode), new PropertyMetadata("", OnUrlChanged));

    /// <summary>Identifies the <c>Header</c> attached property, which receives the article heading.</summary>
    public static readonly DependencyProperty HeaderProperty =
        DependencyProperty.RegisterAttached("Header", typeof(string), typeof(WebViewReaderMode), new PropertyMetadata("", OnHeaderChanged));

    /// <summary>Gets the value of the <c>Url</c> attached property from <paramref name="obj"/>.</summary>
    public static string GetUrl(DependencyObject obj)
    {
        return (string)obj.GetValue(UrlProperty);
    }

    /// <summary>Sets the <c>Url</c> attached property on <paramref name="obj"/>.</summary>
    public static void SetUrl(DependencyObject obj, string value)
    {
        obj.SetValue(UrlProperty, value);
    }

    /// <summary>Gets the value of the <c>Header</c> attached property from <paramref name="obj"/>.</summary>
    public static string GetHeader(DependencyObject obj)
    {
        return (string)obj.GetValue(HeaderProperty);
    }

    /// <summary>Sets the <c>Header</c> attached property on <paramref name="obj"/>.</summary>
    public static void SetHeader(DependencyObject obj, string value)
    {
        obj.SetValue(HeaderProperty, value);
    }

    /// <summary>
    /// Property-changed callback for <c>Url</c>: starts scraping the new URL into the
    /// WebView the property is attached to. Does nothing when the owner is not a
    /// WebView or the new value is null/blank.
    /// </summary>
    private static async void OnUrlChanged(DependencyObject sender, DependencyPropertyChangedEventArgs eventArgs)
    {
        var webView = sender as WebView;
        var url = eventArgs.NewValue as string;
        if (webView == null || string.IsNullOrWhiteSpace(url))
        {
            return;
        }

        try
        {
            await scrapeWebContentAsync(webView, url);
        }
        catch (Exception ex)
        {
            // This is an async void callback: an exception escaping it cannot be observed
            // by any caller and would take the whole app down, so a failed download or
            // parse is reported and the WebView is left as it was.
            System.Diagnostics.Debug.WriteLine("WebViewReaderMode: could not load '" + url + "': " + ex);
        }
    }

    /// <summary>Property-changed callback for <c>Header</c>; intentionally does nothing.</summary>
    private static void OnHeaderChanged(DependencyObject sender, DependencyPropertyChangedEventArgs eventArgs)
    {
    }

    /// <summary>
    /// Downloads <paramref name="url"/>, extracts the most article-like div and shows it
    /// in <paramref name="webView"/>, setting the <c>Header</c> attached property to the
    /// article heading. Leaves the WebView untouched when no suitable div is found.
    /// </summary>
    /// <param name="webView">The WebView that receives the header and the article markup.</param>
    /// <param name="url">Address of the page to scrape.</param>
    private static async System.Threading.Tasks.Task scrapeWebContentAsync(WebView webView, string url)
    {
        var htmlWeb = new HtmlWeb();
        var doc = await htmlWeb.LoadFromWebAsync(url);
        if (doc == null || doc.DocumentNode == null)
        {
            return;
        }

        // The Url may have been changed again while the download was in flight;
        // do not let an older page overwrite the newer request.
        if (!string.Equals(GetUrl(webView), url, StringComparison.Ordinal))
        {
            return;
        }

        doc.OptionOutputAsXml = true;
        doc.OptionFixNestedTags = true;
        doc.OptionAutoCloseOnEnd = true;

        // Drop backslashes and collapse line breaks/tabs. The breaks become a single space
        // (not nothing) so that words separated only by a newline are not glued together.
        var html = doc.DocumentNode.InnerHtml.Replace("\\", "");
        doc.DocumentNode.InnerHtml = Regex.Replace(html, @"[\n\t\r]+", " ");

        // Snapshot the tree first: nodes are removed while iterating.
        foreach (var n in walkDom(doc.DocumentNode).ToList())
        {
            if (n.NodeType == HtmlNodeType.Comment || n.Name == "script")
            {
                n.Remove();
            }
        }

        var articleNode = findArticleNode(doc.DocumentNode);
        if (articleNode == null)
        {
            return;
        }

        // The heading must be resolved before rendering: when it is found inside the
        // article it is removed from the article markup.
        SetHeader(webView, findHeading(articleNode));
        webView.NavigateToString(renderArticleHtml(articleNode));
    }

    /// <summary>
    /// Returns the div under <paramref name="root"/> with the lowest ratio of link words to
    /// total words among the divs that have more than <c>MinArticleWords</c> words and more
    /// than <c>MinParagraphs</c> p elements, or null when no div qualifies.
    /// </summary>
    private static HtmlNode findArticleNode(HtmlNode root)
    {
        HtmlNode best = null;
        var bestRatio = double.MaxValue;

        foreach (var candidate in getElementsofType(root, "div"))
        {
            var words = WCforNode(candidate);
            if (words <= MinArticleWords || getElementsofType(candidate, "p").Count <= MinParagraphs)
            {
                continue;
            }

            var linkWords = getLinkWC(candidate);
            if (linkWords == 0)
            {
                linkWords = LinkFreeWordCount;
            }

            var ratio = (double)linkWords / words;
            if (ratio < bestRatio)
            {
                bestRatio = ratio;
                best = candidate;
            }
        }

        return best;
    }

    /// <summary>
    /// Looks for the article heading: first inside the article (removing it from there),
    /// then in the nearest preceding non-blank sibling, then in up to
    /// <c>MaxHeadingAncestors</c> ancestors. Returns "" when nothing suitable is found.
    /// </summary>
    private static string findHeading(HtmlNode articleNode)
    {
        var heading = getTitlefromElement(articleNode, true);

        if (heading == "")
        {
            // Skip whitespace-only text nodes so the sibling element itself is inspected.
            var previous = articleNode.PreviousSibling;
            while (previous != null
                   && previous.NodeType == HtmlNodeType.Text
                   && string.IsNullOrWhiteSpace(previous.InnerText))
            {
                previous = previous.PreviousSibling;
            }

            heading = getTitlefromElement(previous);
        }

        // Stops early at the top of the tree instead of dereferencing a missing parent.
        var ancestor = articleNode.ParentNode;
        for (var level = 0; heading == "" && ancestor != null && level < MaxHeadingAncestors; level++)
        {
            heading = getTitlefromElement(ancestor);
            ancestor = ancestor.ParentNode;
        }

        return heading;
    }

    /// <summary>
    /// Serializes <paramref name="articleNode"/> through a fresh document (XML output,
    /// nested-tag fixing and auto-closing enabled) and prefixes it with a style element
    /// built from the app theme.
    /// </summary>
    private static string renderArticleHtml(HtmlNode articleNode)
    {
        var cleandoc = new HtmlDocument();
        cleandoc.OptionOutputAsXml = true;
        cleandoc.OptionFixNestedTags = true;
        cleandoc.OptionAutoCloseOnEnd = true;
        cleandoc.LoadHtml(articleNode.OuterHtml);

        string content;
        using (var writer = new StringWriter())
        {
            cleandoc.Save(writer);
            content = writer.ToString();
        }

        // Remove the XML declaration whatever encoding name it carries.
        content = Regex.Replace(content, @"<\?xml[^>]*\?>", "");

        return buildThemeStyle() + content;
    }

    /// <summary>
    /// Builds the style element applied to the article from the application's theme
    /// resources (foreground brush, page background brush and content font size).
    /// </summary>
    private static string buildThemeStyle()
    {
        var resources = App.Current.Resources;
        var forgroundColor = color2Rbga(((SolidColorBrush)resources["ApplicationForegroundThemeBrush"]).Color);
        var backgroundColor = color2Rbga(((SolidColorBrush)resources["ApplicationPageBackgroundThemeBrush"]).Color);
        var fontSize = resources["ControlContentThemeFontSize"] as double?;

        // CSS needs a '.' decimal separator regardless of the user's culture; when the
        // resource is not a double the rule is left out rather than emitting "font-size:pt".
        var fontSizeRule = fontSize.HasValue
            ? "font-size:" + fontSize.Value.ToString(System.Globalization.CultureInfo.InvariantCulture) + "pt;"
            : "";

        return "<style>*{font-family: Calibri, sans-serif;" + fontSizeRule
            + "background-color:" + backgroundColor
            + ";color:" + forgroundColor + ";}</style>";
    }

    /// <summary>
    /// Formats <paramref name="col"/> as a CSS <c>rgba()</c> value surrounded by spaces.
    /// The 0-255 alpha byte is scaled to the 0-1 range that CSS expects.
    /// </summary>
    private static string color2Rbga(Color col)
    {
        var alpha = (col.A / 255.0).ToString("0.###", System.Globalization.CultureInfo.InvariantCulture);
        return " rgba(" + col.R + "," + col.G + "," + col.B + "," + alpha + ") ";
    }

    /// <summary>
    /// Enumerates <paramref name="node"/> and all of its descendants in document
    /// (pre-order) sequence. Yields nothing for a null node.
    /// </summary>
    private static IEnumerable<HtmlNode> walkDom(HtmlNode node)
    {
        if (node == null)
        {
            yield break;
        }

        // Explicit stack instead of nested recursive iterators.
        var pending = new Stack<HtmlNode>();
        pending.Push(node);
        while (pending.Count > 0)
        {
            var current = pending.Pop();
            yield return current;

            // Push children right-to-left so the leftmost child is visited next.
            var children = current.ChildNodes;
            for (var i = children.Count - 1; i >= 0; i--)
            {
                pending.Push(children[i]);
            }
        }
    }

    /// <summary>
    /// Returns the text of the first h1 under <paramref name="node"/> when it is longer than
    /// <c>MinHeadingLength</c> characters, otherwise tries the first h2 and then the first h3
    /// the same way. Returns "" when none qualifies or <paramref name="node"/> is null.
    /// </summary>
    /// <param name="node">Root of the subtree to search (may be null).</param>
    /// <param name="removeEle">When true, the heading element that was used is removed from the tree.</param>
    private static string getTitlefromElement(HtmlNode node, bool removeEle = false)
    {
        if (node == null)
        {
            return "";
        }

        foreach (var tag in HeadingTags)
        {
            var heading = walkDom(node).FirstOrDefault(x => x.Name == tag);
            if (heading != null && heading.InnerText.Length > MinHeadingLength)
            {
                var headerText = heading.InnerText;
                if (removeEle)
                {
                    heading.Remove();
                }

                return headerText;
            }
        }

        return "";
    }

    /// <summary>Counts the words (runs of word characters) in <paramref name="text"/>.</summary>
    private static int countWords(string text)
    {
        return string.IsNullOrEmpty(text) ? 0 : WordPattern.Matches(text).Count;
    }

    /// <summary>Returns the number of words in all text nodes under <paramref name="node"/>.</summary>
    private static int WCforNode(HtmlNode node)
    {
        return getElementsofType(node, "#text").Sum(segment => countWords(segment.InnerText));
    }

    /// <summary>
    /// Returns the average number of words per <paramref name="elementType"/> element under
    /// <paramref name="node"/>, or 0 when there is no such element.
    /// </summary>
    private static double AvgWCforNode(HtmlNode node, string elementType)
    {
        var counts = getElementsofType(node, elementType)
            .Select(segment => countWords(segment.InnerText))
            .ToList();
        return counts.Count == 0 ? 0.0 : counts.Average();
    }

    /// <summary>Returns the number of words contained in the a elements under <paramref name="node"/>.</summary>
    private static int getLinkWC(HtmlNode node)
    {
        return getElementsofType(node, "a").Sum(link => WCforNode(link));
    }

    /// <summary>
    /// Returns <paramref name="node"/> and/or its descendants whose <c>Name</c> equals
    /// <paramref name="elementType"/>, in document order.
    /// </summary>
    private static List<HtmlNode> getElementsofType(HtmlNode node, string elementType)
    {
        return walkDom(node).Where(x => x.Name == elementType).ToList();
    }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment