Skip to content

Instantly share code, notes, and snippets.

@The-Quill
Created January 7, 2016 02:25
Show Gist options
  • Save The-Quill/9d012ea2e72c87e725ac to your computer and use it in GitHub Desktop.
Save The-Quill/9d012ea2e72c87e725ac to your computer and use it in GitHub Desktop.
using System;
using System.Collections.Generic;
using System.ComponentModel.Composition;
using System.Linq;
using System.Text.RegularExpressions;
using DatabaseGateway;
using HtmlAgilityPack;
using MEF.Contract;
using Scraper;
namespace Plugin1
{
/// <summary>
/// MEF-exported plugin that crawls the site rooted at <c>ScrapeStart</c>: links matching
/// the directory pattern are followed, and links matching the listing pattern are scraped
/// and handed to <c>DatabaseUtils.AddListing</c>.
/// </summary>
/// <remarks>
/// The crawl is started from the constructor, so creating an instance does not return
/// until the whole crawl has finished.
/// </remarks>
[Export(typeof(IPlugin))]
public class StartProcess : IPlugin
{
    private const string ScrapeStart = "http://example.com";
    private const string DirectoryRegex = "/AllLocations-";
    private const string ListingRegex = "Hotel_Review";

    // Pattern used to pull the listing identifier segment out of a listing URL.
    private const string ListingIdPattern = "-(d[0-9]+)-";

    // NOTE(review): in XPath 1.0 an unquoted HEADING is a child-element test rather than
    // the string 'HEADING', and a single leading '/' anchors at the document root.
    // "//h1[@class='HEADING']" may have been intended - confirm against
    // PageScraper.FetchNode before changing it.
    private const string ListingNameXPath = "/h1[@class=HEADING]";

    private readonly ScraperRules _currentRules;
    private readonly DatabaseUtils _database;

    public string Status { get; set; }

    public string ComponentDescription
    {
        get { return "example.com"; }
    }

    /// <summary>
    /// Builds the scraper rules and database helper, then immediately crawls from
    /// <c>ScrapeStart</c>.
    /// </summary>
    public StartProcess()
    {
        _currentRules = new ScraperRules(DirectoryRegex, ListingRegex);
        _database = new DatabaseUtils();
        RecursiveLinks(ScrapeStart);
    }

    /// <summary>
    /// Crawls <paramref name="directory"/> and every directory page reachable from it,
    /// storing each listing page that is found.
    /// </summary>
    /// <param name="directory">URL of the page whose links are to be followed.</param>
    /// <remarks>
    /// Each URL is processed at most once per call, so pages that link to one another
    /// cannot cause unbounded recursion and a listing linked several times is stored once.
    /// </remarks>
    public void RecursiveLinks(string directory)
    {
        var visited = new HashSet<string>();
        visited.Add(directory);
        CrawlDirectory(directory, visited);
    }

    // Follows the links of one directory page; 'visited' is shared by the whole crawl.
    private void CrawlDirectory(string directory, HashSet<string> visited)
    {
        var linkDocument = new PageScraper(directory);
        HtmlNodeCollection links = linkDocument.FetchLinks();
        if (links == null)
        {
            // Defensive: FetchLinks is defined elsewhere and may yield null for a page
            // without links (HtmlAgilityPack's SelectNodes does) - TODO confirm.
            return;
        }

        foreach (string url in links.Select(PageScraper.FetchNodeText))
        {
            // A blank link must only skip that link, not abandon the rest of the page.
            if (string.IsNullOrWhiteSpace(url))
            {
                continue;
            }

            // Add returns false when the URL was already seen during this crawl.
            if (!visited.Add(url))
            {
                continue;
            }

            if (Regex.IsMatch(url, _currentRules.DirectoryRegex))
            {
                CrawlDirectory(url, visited);
            }
            else if (Regex.IsMatch(url, _currentRules.ListingRegex))
            {
                _database.AddListing(BuildListing(url));
            }
        }
    }

    // Scrapes a single listing page into a DataListing ready to be stored.
    private static DataListing BuildListing(string url)
    {
        var listingPage = new PageScraper(url);

        // NOTE(review): the whole match (hyphens included) is stored, exactly as the
        // previous Captures[0] did; group 1 would give the bare id - confirm which one
        // consumers of CustomData1 expect. A URL without an id segment now yields an
        // empty string instead of throwing and aborting the crawl.
        Match idMatch = Regex.Match(url, ListingIdPattern);
        string listingId = idMatch.Success ? idMatch.Value : string.Empty;

        // A page without the expected heading yields an empty name rather than a
        // NullReferenceException.
        HtmlNode headingNode = listingPage.FetchNode(ListingNameXPath);
        string listingName = headingNode != null ? headingNode.InnerText : string.Empty;

        // new DateTime() is 0001-01-01, not "now"; stamp the row with the current time.
        DateTime timestamp = DateTime.UtcNow;

        return new DataListing
        {
            CustomData1 = listingId,
            DataProviderId = 0,
            DeepLink = string.Empty,
            ListingName = listingName,
            Price = string.Empty,
            Rating = string.Empty,
            IsActive = true,
            AddedDate = timestamp,
            UpdatedDate = timestamp
        };
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment