Skip to content

Instantly share code, notes, and snippets.

@CodeCommissions
Last active January 25, 2022 20:14
Show Gist options
  • Star 9 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save CodeCommissions/43a5aac117873dabcb1c9bdbf98cbd39 to your computer and use it in GitHub Desktop.
Save CodeCommissions/43a5aac117873dabcb1c9bdbf98cbd39 to your computer and use it in GitHub Desktop.
A basic web scraper using AngleSharp. Part of a WPF application.
//Base code courtesy of:
//https://dev.to/rachelsoderberg/create-a-simple-web-scraper-in-c-1l1m
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;
using System.Windows;
using AngleSharp.Dom;
using AngleSharp.Html.Dom;
using AngleSharp.Html.Parser;
using AngleSharp.Text;
namespace SportsScraper
{
/// <summary>
/// Interaction logic for MainWindow.xaml.
/// Downloads the oceannetworks.ca news listing, selects the entries whose
/// surrounding markup mentions one of <see cref="QueryTerms"/>, and appends a
/// "title - url" line for each of them to the <c>rtb_debugDisplay</c> control.
/// </summary>
public partial class MainWindow : Window
{
    /// <summary>Scheme and host prepended to the site-relative links found in the listing.</summary>
    private const string SiteRoot = "https://www.oceannetworks.ca";

    /// <summary>Address of the page that is downloaded and scanned.</summary>
    private const string SiteUrl = SiteRoot + "/news/stories";

    /// <summary>Exact <c>class</c> attribute value of the elements treated as article entries.</summary>
    private const string ArticleClassName = "views-field views-field-nothing";

    // A single client is shared by every scrape: creating a new HttpClient per
    // request leaves sockets lingering and can exhaust them under repeated use.
    private static readonly HttpClient s_httpClient = new HttpClient();

    public MainWindow()
    {
        InitializeComponent();
    }

    /// <summary>Terms an entry's surrounding markup must mention (case-insensitively) to be listed.</summary>
    public string[] QueryTerms { get; } = { "Ocean", "Nature", "Pollution" };

    /// <summary>Starts a scrape when the button is clicked.</summary>
    private void Button_Click(object sender, RoutedEventArgs e)
    {
        ScrapeWebsite();
    }

    /// <summary>
    /// Fire-and-forget entry point: runs <see cref="ScrapeWebsiteAsync"/> and writes
    /// download failures to the debug display instead of letting them escape.
    /// </summary>
    /// <remarks>
    /// The method is <c>async void</c>, so an exception leaving it could not be
    /// observed by any caller; the expected network failures are therefore
    /// handled here.
    /// </remarks>
    internal async void ScrapeWebsite()
    {
        try
        {
            await ScrapeWebsiteAsync();
        }
        catch (Exception ex) when (ex is HttpRequestException || ex is OperationCanceledException)
        {
            // HttpRequestException: connection problems and non-success status codes.
            // OperationCanceledException: HttpClient reports a timeout as a cancelled task.
            rtb_debugDisplay.AppendText($"Could not load {SiteUrl}: {ex.Message}{Environment.NewLine}");
        }
    }

    /// <summary>
    /// Downloads <see cref="SiteUrl"/>, parses it as HTML and prints the matching articles.
    /// </summary>
    /// <returns>A task that completes once the results have been written to the display.</returns>
    /// <exception cref="HttpRequestException">
    /// The request failed or the server answered with a non-success status code.
    /// </exception>
    internal async Task ScrapeWebsiteAsync()
    {
        // No ConfigureAwait(false): the code after each await updates a UI control
        // and has to resume on the thread that started the scrape.
        using (HttpResponseMessage response = await s_httpClient.GetAsync(SiteUrl))
        {
            // Do not parse an error page as if it were the article listing.
            response.EnsureSuccessStatusCode();

            using (Stream body = await response.Content.ReadAsStreamAsync())
            {
                IHtmlDocument document = new HtmlParser().ParseDocument(body);
                GetScrapeResults(document);
            }
        }
    }

    /// <summary>
    /// Prints, for each entry in <see cref="QueryTerms"/>, the article elements
    /// whose parent markup mentions that term.
    /// </summary>
    /// <param name="document">The parsed listing page.</param>
    /// <remarks>
    /// Each term is searched and printed on its own, so an article that mentions
    /// several terms is listed once per term.
    /// </remarks>
    private void GetScrapeResults(IHtmlDocument document)
    {
        foreach (string term in QueryTerms)
        {
            // Materialised once so the document is scanned a single time per term.
            // The first match of every term is dropped by Skip(1); presumably it is
            // not a real article entry - TODO confirm against the live page.
            List<IElement> matches = document.All
                .Where(x => x.ClassName == ArticleClassName && ParentMentions(x, term))
                .Skip(1)
                .ToList();

            PrintResults(matches);
        }
    }

    /// <summary>
    /// Tells whether the markup of <paramref name="element"/>'s parent contains
    /// <paramref name="term"/>, ignoring case.
    /// </summary>
    private static bool ParentMentions(IElement element, string term)
    {
        string parentHtml = element.ParentElement?.InnerHtml;
        return parentHtml != null
            && parentHtml.IndexOf(term, StringComparison.OrdinalIgnoreCase) >= 0;
    }

    /// <summary>
    /// Appends one "title - url" line per element to the debug display.
    /// </summary>
    /// <param name="articleLink">Article elements to print; may be empty.</param>
    /// <exception cref="ArgumentNullException"><paramref name="articleLink"/> is null.</exception>
    public void PrintResults(IEnumerable<IElement> articleLink)
    {
        if (articleLink == null)
        {
            throw new ArgumentNullException(nameof(articleLink));
        }

        foreach (IElement element in articleLink)
        {
            string url;
            string title;
            if (TryExtractArticle(element, out url, out title))
            {
                rtb_debugDisplay.AppendText($"{title} - {url}{Environment.NewLine}");
            }
            else
            {
                // Make a markup change on the site visible instead of crashing on it.
                rtb_debugDisplay.AppendText($"(skipped an entry with unexpected markup){Environment.NewLine}");
            }
        }
    }

    /// <summary>
    /// Reduces the inner HTML of an article element to an absolute URL and a title.
    /// </summary>
    /// <param name="result">The article element to read.</param>
    /// <param name="url">Receives the absolute article URL on success; otherwise null.</param>
    /// <param name="title">Receives the text that followed the link target on success; otherwise null.</param>
    /// <returns>
    /// <c>true</c> when a link target could be separated from the rest of the
    /// markup; <c>false</c> when the element is null or has no recognisable link.
    /// </returns>
    /// <remarks>
    /// The extraction depends on the exact markup emitted by the site: every
    /// replacement below removes one literal fragment and leaves the text
    /// untouched when that fragment is absent.
    /// </remarks>
    private static bool TryExtractArticle(IElement result, out string url, out string title)
    {
        url = null;
        title = null;

        if (result == null || result.InnerHtml == null)
        {
            return false;
        }

        // Turn the opening wrapper plus the relative href into an absolute URL.
        string html = result.InnerHtml.ReplaceFirst(" <span class=\"field-content\"><div><a href=\"", SiteRoot);
        // The end of the href attribute becomes the URL/title separator.
        html = html.ReplaceFirst("\">", "*");
        html = html.ReplaceFirst("</a></div>\n<div class=\"article-title-top\">", "-");
        html = html.ReplaceFirst("</div>\n<hr></span> ", "");

        // Split on the first separator only, so a '*' inside the title is preserved.
        string[] parts = html.Split(new[] { '*' }, 2);
        if (parts.Length < 2)
        {
            return false;
        }

        url = parts[0];
        title = parts[1];
        return true;
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment