Skip to content

Instantly share code, notes, and snippets.

@simonmurdock
Last active February 23, 2021 18:30
Show Gist options
  • Save simonmurdock/af265f6eb4530b9506b211f2118c486a to your computer and use it in GitHub Desktop.
Save simonmurdock/af265f6eb4530b9506b211f2118c486a to your computer and use it in GitHub Desktop.
scraper
using System;
using System.Diagnostics;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Net.Http;
using HtmlAgilityPack;
namespace scraper
{
/// <summary>
/// Kind of resource recorded by the scraper, determined by the HTML element
/// the URL was taken from.
/// </summary>
enum AssetType
{
    /// <summary>Target of an <c>a[href]</c> anchor; the only kind that is crawled further.</summary>
    Document = 0,

    /// <summary>Source of an <c>img[src]</c> element.</summary>
    Image = 1,

    /// <summary>Target of a <c>link[href]</c> element.</summary>
    Link = 2,

    /// <summary>Source of a <c>script[src]</c> element.</summary>
    Script = 3
}
/// <summary>
/// One URL discovered during a crawl, together with its bookkeeping flags.
/// </summary>
internal class Asset
{
    /// <summary>URL of the page being scraped when this asset was recorded.</summary>
    public string ParentUrl { get; set; }

    /// <summary>Absolute URL of the asset; the scraper uses it as the de-duplication key.</summary>
    public string Url { get; set; }

    /// <summary>Kind of element the URL was found on.</summary>
    public AssetType Type { get; set; }

    /// <summary>
    /// True once nothing further needs to be done for this asset. Documents become
    /// done after they have been scraped; other asset types are created as done.
    /// </summary>
    public bool Done { get; set; }

    /// <summary>True when the scraper rejected the URL as a scrape target; ignored assets are left out of the report.</summary>
    public bool Ignored { get; set; }
}
/// <summary>
/// Console entry point: crawls the hard-coded site, writes the report and
/// waits for a key press before exiting.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        var crawler = new Scraper();
        crawler.Scrape("http://192.168.30.1:8080/php/");

        // A document still needs work while it has been neither scraped nor ignored.
        Func<Asset, bool> isPending =
            asset => asset.Type == AssetType.Document && !asset.Done && !asset.Ignored;

        // Every scrape may queue further documents, so the check is repeated
        // until no pending document is left.
        while (crawler.Assets.Any(isPending))
        {
            crawler.CheckForUnscrapedUrls();
        }

        crawler.PrintAsets();
        Console.ReadKey();
    }
}
/// <summary>
/// Crawls the HTML pages below a fixed base URL and records every page, image,
/// <c>link</c> reference and script it finds in <see cref="Assets"/>.
/// </summary>
/// <remarks>
/// Typical use: call <see cref="Scrape"/> once with the start page, call
/// <see cref="CheckForUnscrapedUrls"/> until no unscraped document is left, then
/// call <see cref="PrintAsets"/> to write the report.
/// </remarks>
class Scraper
{
    // Only URLs starting with this prefix are fetched; all others are recorded as ignored.
    private readonly Uri baseUrl = new Uri("http://192.168.30.1:8080/php/");
    private readonly Uri baseAddress = new Uri("http://192.168.30.1:8080");

    // A single client is reused for every request.
    private readonly HttpClient client;
    private readonly Stopwatch timer;
    private int scrapes = 0;

    /// <summary>Every asset discovered so far, in discovery order, de-duplicated by URL.</summary>
    public IList<Asset> Assets { get; set; }

    public Scraper()
    {
        Assets = new List<Asset>();
        timer = new Stopwatch();
        client = new HttpClient()
        {
            BaseAddress = baseAddress
        };
    }

    /// <summary>
    /// Downloads one page and records the anchors, images, link elements and
    /// scripts it references.
    /// </summary>
    /// <param name="target">
    /// Absolute URL of the page. Any <c>#fragment</c> is dropped. A URL outside
    /// the base URL is recorded as an ignored document and not fetched.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="target"/> is null.</exception>
    /// <exception cref="AggregateException">The HTTP request failed.</exception>
    public void Scrape(string target)
    {
        if (target == null)
        {
            throw new ArgumentNullException("target");
        }

        scrapes++;
        timer.Start();

        target = StripFragment(target);

        if (!TargetIsValid(target))
        {
            MarkIgnored(target);
            return;
        }

        Uri responseUri;
        string contents = GetTargetContents(target, out responseUri);

        // Coarse progress indicator: one line per hundred scrape calls.
        if (scrapes % 100 == 0)
        {
            Console.WriteLine(".");
        }

        // Register the page itself so it shows up in the report (and is not
        // fetched again) even when no other page links to it.
        if (!Assets.Any(x => x.Url == target))
        {
            Assets.Add(new Asset() { Url = target, Type = AssetType.Document, Done = true });
        }

        // Relative references are resolved against the page they appear on.
        // Prefer the URI the response was served from (it reflects redirects),
        // then the requested URL, and only then the crawl root.
        Uri pageUri = responseUri;
        if (pageUri == null || !pageUri.IsAbsoluteUri)
        {
            if (!Uri.TryCreate(target, UriKind.Absolute, out pageUri))
            {
                pageUri = baseUrl;
            }
        }

        var doc = new HtmlDocument();
        doc.LoadHtml(contents);

        CollectAssets(doc, "//a[@href]", "href", AssetType.Document, pageUri, target);
        CollectAssets(doc, "//img[@src]", "src", AssetType.Image, pageUri, target);
        CollectAssets(doc, "//link[@href]", "href", AssetType.Link, pageUri, target);
        CollectAssets(doc, "//script[@src]", "src", AssetType.Script, pageUri, target);
    }

    // Records one asset for every node matched by xpath, using the URL held in attributeName.
    private void CollectAssets(HtmlDocument doc, string xpath, string attributeName, AssetType type, Uri pageUri, string parentUrl)
    {
        var nodes = doc.DocumentNode.SelectNodes(xpath);
        if (nodes == null)
        {
            // SelectNodes yields null rather than an empty collection when nothing matches.
            return;
        }

        foreach (HtmlNode node in nodes)
        {
            // Attribute values may carry HTML entities (e.g. "&amp;" in a query string).
            var value = HtmlEntity.DeEntitize(node.GetAttributeValue(attributeName, string.Empty));
            if (string.IsNullOrWhiteSpace(value))
            {
                continue;
            }

            // Malformed or oversized values (e.g. large data: URIs) are skipped
            // instead of aborting the whole crawl.
            Uri absolute;
            if (!Uri.TryCreate(pageUri, value.Trim(), out absolute))
            {
                continue;
            }

            var asset = new Asset() { Url = absolute.AbsoluteUri, Type = type, ParentUrl = parentUrl };
            if (type == AssetType.Document)
            {
                // "page#a" and "page#b" are the same document: store it once.
                asset.Url = StripFragment(asset.Url);

                // Out-of-scope links are flagged immediately so they never enter the work queue.
                if (!TargetIsValid(asset.Url))
                {
                    asset.Done = true;
                    asset.Ignored = true;
                }
            }
            else
            {
                // Non-document assets are only listed, never fetched.
                asset.Done = true;
            }

            AddToAssetsCollection(asset);
        }
    }

    // Returns url without its "#fragment" part, if it has one.
    private static string StripFragment(string url)
    {
        int hash = url.IndexOf('#');
        return hash < 0 ? url : url.Substring(0, hash);
    }

    // A target is in scope only when it lies below the base URL. A prefix test is
    // used so that a foreign URL that merely embeds the base URL is rejected.
    private bool TargetIsValid(string target)
    {
        return target.StartsWith(baseUrl.AbsoluteUri, StringComparison.Ordinal);
    }

    // Flags url as an ignored document, updating the queued document entry when
    // there is one rather than relying on a duplicate that would be discarded.
    private void MarkIgnored(string url)
    {
        var existing = Assets.FirstOrDefault(x => x.Url == url);
        if (existing == null)
        {
            Assets.Add(new Asset() { Url = url, Type = AssetType.Document, Done = true, Ignored = true });
        }
        else if (existing.Type == AssetType.Document)
        {
            existing.Done = true;
            existing.Ignored = true;
        }
    }

    // Adds asset unless an asset with the same URL has already been recorded.
    private void AddToAssetsCollection(Asset asset)
    {
        if (!Assets.Any(x => x.Url == asset.Url))
        {
            Assets.Add(asset);
        }
    }

    /// <summary>
    /// Scrapes the first document that is neither done nor ignored and marks it
    /// done. Does nothing when no such document exists.
    /// </summary>
    public void CheckForUnscrapedUrls()
    {
        var todo = Assets.FirstOrDefault(x => !x.Done && x.Type == AssetType.Document && !x.Ignored);
        if (todo == null)
        {
            return;
        }

        Scrape(todo.Url);
        todo.Done = true;
    }

    // Downloads target and returns the response body as text. responseUri receives
    // the URI of the request that produced the response, or null when unavailable.
    private string GetTargetContents(string target, out Uri responseUri)
    {
        // .Result is kept so that request failures still surface as AggregateException.
        using (var response = client.GetAsync(target).Result)
        {
            var request = response.RequestMessage;
            responseUri = request != null ? request.RequestUri : null;
            return response.Content.ReadAsStringAsync().Result;
        }
    }

    /// <summary>
    /// Stops the timer, writes every non-ignored document followed by the assets
    /// recorded under it to a text file, and prints the elapsed time to the console.
    /// </summary>
    /// <param name="outputPath">Report file to create or overwrite; defaults to the original hard-coded location.</param>
    /// <remarks>The method name keeps its original spelling because existing callers use it.</remarks>
    internal void PrintAsets(string outputPath = @"C:\Users\SImon\Desktop\test.txt")
    {
        timer.Stop();

        var lines = new List<string>();
        foreach (var doc in Assets.Where(x => x.Type == AssetType.Document && !x.Ignored))
        {
            lines.Add(doc.Type + " > " + doc.Url);
            foreach (var child in Assets.Where(x => x.ParentUrl == doc.Url && !x.Ignored))
            {
                lines.Add(" " + child.Type + " > " + child.Url);
            }
        }

        // The using block closes the file even when a write fails.
        using (var file = new System.IO.StreamWriter(outputPath))
        {
            foreach (var line in lines)
            {
                file.WriteLine(line);
            }
        }

        Console.WriteLine("-------------------");
        Console.WriteLine("Scrape took " + timer.ElapsedMilliseconds + "ms");
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment