Skip to content

Instantly share code, notes, and snippets.

@synhershko
Last active December 17, 2015 20:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save synhershko/5670157 to your computer and use it in GitHub Desktop.
1. Install-Package NEST
2. Download and reference crawler binaries: https://code.google.com/p/abot/downloads/detail?name=Abotv1.1.1_Bin.zip&can=2&q=
3. Add the code attached to this gist
4. Have fun
using System;
using Abot.Poco;
namespace Basics
{
/// <summary>
/// Flattened, index-ready representation of a crawled web page,
/// built from an Abot <c>CrawledPage</c> for storage in Elasticsearch.
/// </summary>
public class Page
{
    /// <summary>
    /// Extracts title, body text and crawl metadata from <paramref name="crawledPage"/>.
    /// </summary>
    /// <param name="crawledPage">The page produced by the Abot crawler; must have a parsed HtmlDocument.</param>
    public Page(CrawledPage crawledPage)
    {
        var doc = crawledPage.HtmlDocument.DocumentNode;

        // SelectSingleNode returns null when the XPath matches nothing
        // (e.g. a page without <title> or <body>); guard to avoid NullReferenceException.
        var titleNode = doc.SelectSingleNode("//title");
        Title = titleNode == null
            ? String.Empty
            : titleNode.InnerText.Replace(" - Wikipedia, the free encyclopedia", String.Empty);

        var bodyNode = doc.SelectSingleNode("//body");
        Content = bodyNode == null ? String.Empty : bodyNode.InnerText;

        Uri = crawledPage.Uri.ToString();
        // ParentUri is null for the crawl root page; keep the property null in that case.
        ParentUri = crawledPage.ParentUri == null ? null : crawledPage.ParentUri.ToString();
        PageSizeInBytes = crawledPage.PageSizeInBytes;
        CrawlDepth = crawledPage.CrawlDepth;
        // UTC so indexed timestamps are comparable regardless of crawler host time zone.
        Timestamp = DateTimeOffset.UtcNow;
    }

    public string Uri { get; set; }
    public string ParentUri { get; set; }
    public long PageSizeInBytes { get; set; }
    public int CrawlDepth { get; set; }
    public string Title { get; set; }
    public string Content { get; set; }
    public DateTimeOffset Timestamp { get; set; }
}
}
using System;
using System.Net;
using Abot.Core;
using Abot.Crawler;
using Abot.Poco;
using Nest;
namespace Basics
{
/// <summary>
/// Console driver: connects to a local Elasticsearch node via NEST,
/// configures an Abot crawler over the Wikipedia main page, and wires
/// logging handlers for the crawler's lifecycle events.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        var client = new ElasticClient(new ConnectionSettings(new Uri("http://localhost:9200")).SetDefaultIndex("pages"));
        ConnectionStatus connectionStatus;
        if (!client.TryConnect(out connectionStatus))
        {
            Console.WriteLine("Error");
            // Error handling
            return;
        }
        Console.WriteLine("Connected to " + client.Settings.Uri);

        var crawlConfig = new CrawlConfiguration
        {
            CrawlTimeoutSeconds = 3600,
            MaxConcurrentThreads = 10,
            UserAgentString = "abot v1.0 http://code.google.com/p/abot"
        };

        //Will use the manually created crawlConfig object created above
        var crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
        crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
        crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
        crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
        crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

        // Stash the client in the crawl bag so event handlers can index pages.
        crawler.CrawlBag.ElasticClient = client;

        var result = crawler.Crawl(new Uri("http://en.wikipedia.org/wiki/Main_Page/"));
        if (result.ErrorOccurred)
            Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException);
        else
            Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
        // ...
    }

    /// <summary>Logs each page just before it is fetched.</summary>
    static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
    {
        PageToCrawl pageToCrawl = e.PageToCrawl;
        // ParentUri is null for the crawl root page; guard to avoid NullReferenceException.
        var parent = pageToCrawl.ParentUri == null ? "(root)" : pageToCrawl.ParentUri.AbsoluteUri;
        Console.WriteLine("About to crawl link {0} which was found on page {1}", pageToCrawl.Uri.AbsoluteUri, parent);
    }

    /// <summary>Logs the outcome of each fetched page; intended to index it into Elasticsearch.</summary>
    static void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
    {
        CrawledPage crawledPage = e.CrawledPage;
        // HttpWebResponse can be null when the request failed outright
        // (e.g. DNS/connect failure); check it before dereferencing StatusCode.
        if (crawledPage.WebException != null
            || crawledPage.HttpWebResponse == null
            || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
        else
            Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
        if (string.IsNullOrEmpty(crawledPage.RawContent))
            Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);

        var doc = e.CrawledPage.HtmlDocument.DocumentNode;
        // Client was stashed in the crawl bag by Main.
        ElasticClient client = e.CrawlContext.CrawlBag.ElasticClient;
        // TODO
    }

    /// <summary>Logs pages whose outbound links were skipped (e.g. robots rules, depth limits).</summary>
    static void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
    {
        CrawledPage crawledPage = e.CrawledPage;
        Console.WriteLine("Did not crawl the links on page {0} due to {1}", crawledPage.Uri.AbsoluteUri, e.DisallowedReason);
    }

    /// <summary>Logs pages that were skipped entirely and the reason.</summary>
    static void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
    {
        PageToCrawl pageToCrawl = e.PageToCrawl;
        Console.WriteLine("Did not crawl page {0} due to {1}", pageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment