Skip to content

Instantly share code, notes, and snippets.

Created August 2, 2009 12:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JeffreyZhao/160043 to your computer and use it in GitHub Desktop.
Save JeffreyZhao/160043 to your computer and use it in GitHub Desktop.
namespace Crawling
using System;
using System.Linq;
using System.Collections.Generic;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using ActorLite;
/// <summary>
/// Console entry point for the crawling sample.
/// </summary>
// NOTE(review): block delimiters ({ }) are absent in this copy of the file.
static class Program
/// <summary>
/// Creates a <see cref="Monitor"/> whose maximum crawler count is 5, posts it an initial
/// crawl request, and constructs a <see cref="TestStatisticPort"/> bound to that monitor.
/// </summary>
/// <param name="args">Command-line arguments; not read by the visible code.</param>
static void Main(string[] args)
var monitor = new Monitor(5);
// NOTE(review): the seed URL is an empty string in this copy; Crawler.Crawl passes the
// URL to new Uri(url) — confirm the intended start address.
monitor.Post(m => m.Crawl(""));
// NOTE(review): testPort is constructed but no call to Start() is visible here —
// confirm whether one is missing.
TestStatisticPort testPort = new TestStatisticPort(monitor);
/// <summary>
/// Test client that asks a statistic port for the crawled-page count and prints the
/// reply to the console.
/// </summary>
// NOTE(review): block delimiters ({ }) and #endregion directives are absent in this copy of the file.
public class TestStatisticPort : IPort<IStatisticResponseHandler>, IStatisticResponseHandler
// Port that statistic requests are posted to.
private IPort<IStatisticRequestHandelr> m_statisticPort;
/// <summary>Creates a test client bound to the given statistic port.</summary>
/// <param name="statisticPort">Port that receives the requests issued by <see cref="Start"/>.</param>
public TestStatisticPort(IPort<IStatisticRequestHandelr> statisticPort)
this.m_statisticPort = statisticPort;
/// <summary>
/// Loops forever, posting a GetCrawledCount request that names this object as the
/// port to reply to.
/// </summary>
public void Start()
// NOTE(review): no delay or blocking call is visible inside this loop — confirm
// whether a statement that paces the requests is missing.
while (true)
this.m_statisticPort.Post(s => s.GetCrawledCount(this));
#region IPort<IStatisticResponseHandler> Members
/// <summary>Receives a message addressed to this port.</summary>
/// <param name="message">Callback that expects an <see cref="IStatisticResponseHandler"/>.</param>
// NOTE(review): no body is visible for this method in this copy. Crawler and Monitor
// handle their messages with message(this) — confirm the same is intended here.
void IPort<IStatisticResponseHandler>.Post(Action<IStatisticResponseHandler> message)
#region IStatisticResponseHandler Members
/// <summary>Writes the reported crawled count to the console.</summary>
/// <param name="count">Count supplied by the statistic request handler.</param>
void IStatisticResponseHandler.ReplyCrawledCount(int count)
Console.WriteLine("Crawled: {0}", count);
/// <summary>Not implemented; always throws <see cref="NotImplementedException"/>.</summary>
/// <param name="url">URL the content belongs to (unused).</param>
/// <param name="content">Reported content (unused).</param>
void IStatisticResponseHandler.ReplyContent(string url, string content)
throw new NotImplementedException();
/// <summary>
/// A one-way message endpoint: callers hand it a callback, and the receiver runs that
/// callback against its handler.
/// </summary>
/// <typeparam name="T">
/// Handler type passed to each posted callback. Declared covariant, so a port for a
/// more derived handler can be used where a port for a base handler is expected.
/// </typeparam>
public interface IPort<out T>
{
    /// <summary>Delivers a message to this port.</summary>
    /// <param name="message">Callback to run against the receiving handler.</param>
    void Post(Action<T> message);
}
/// <summary>
/// Request side of the crawl protocol: operations a crawler accepts.
/// </summary>
internal interface ICrawlRequestHandler
{
    /// <summary>Asks the handler to crawl a URL and report the outcome.</summary>
    /// <param name="collector">Port that receives the outcome as an <see cref="ICrawlResponseHandler"/> call.</param>
    /// <param name="url">Address to crawl.</param>
    void Crawl(IPort<ICrawlResponseHandler> collector, string url);
}
/// <summary>
/// Response side of the crawl protocol: outcomes a crawler reports back.
/// </summary>
internal interface ICrawlResponseHandler
{
    /// <summary>Reports that a URL was crawled.</summary>
    /// <param name="crawler">Port of the crawler that did the work.</param>
    /// <param name="url">Address that was crawled.</param>
    /// <param name="content">Downloaded content of the page.</param>
    /// <param name="links">Links extracted from <paramref name="content"/>.</param>
    void Succeeded(IPort<ICrawlRequestHandler> crawler, string url, string content, List<string> links);

    /// <summary>Reports that crawling a URL failed.</summary>
    /// <param name="crawler">Port of the crawler that attempted the work.</param>
    /// <param name="url">Address that could not be crawled.</param>
    /// <param name="ex">Error describing the failure.</param>
    void Failed(IPort<ICrawlRequestHandler> crawler, string url, Exception ex);
}
/// <summary>
/// Request side of the statistics protocol. Each request names the port that should
/// receive the answer.
/// </summary>
public interface IStatisticRequestHandelr
{
    /// <summary>Asks for the crawled count.</summary>
    /// <param name="requester">Port that receives the answer via <see cref="IStatisticResponseHandler.ReplyCrawledCount"/>.</param>
    void GetCrawledCount(IPort<IStatisticResponseHandler> requester);

    /// <summary>Asks for the content associated with a URL.</summary>
    /// <param name="requester">Port that receives the answer via <see cref="IStatisticResponseHandler.ReplyContent"/>.</param>
    /// <param name="url">Address whose content is requested.</param>
    void GetContent(IPort<IStatisticResponseHandler> requester, string url);
}
/// <summary>
/// Response side of the statistics protocol: answers to
/// <see cref="IStatisticRequestHandelr"/> requests.
/// </summary>
public interface IStatisticResponseHandler
{
    /// <summary>Delivers the answer to a crawled-count request.</summary>
    /// <param name="count">The reported count.</param>
    void ReplyCrawledCount(int count);

    /// <summary>Delivers the answer to a content request.</summary>
    /// <param name="url">Address the content was requested for.</param>
    /// <param name="content">The reported content; may be null when none is available.</param>
    void ReplyContent(string url, string content);
}
/// <summary>
/// Actor that downloads one page per crawl request, extracts its absolute http links,
/// and reports the outcome to the collector named in the request.
/// </summary>
internal class Crawler : Actor<Action<Crawler>>, IPort<Crawler>, ICrawlRequestHandler
{
    /// <summary>Runs a posted message against this crawler.</summary>
    /// <param name="message">Callback to execute with this instance.</param>
    protected override void Receive(Action<Crawler> message) { message(this); }

    #region ICrawlRequestHandler Members

    /// <summary>
    /// Starts an asynchronous download of <paramref name="url"/>. On success the content
    /// is posted back to this crawler for link extraction; on any failure, including an
    /// invalid URL, <see cref="ICrawlResponseHandler.Failed"/> is posted to the collector.
    /// </summary>
    /// <param name="collector">Port that receives the outcome.</param>
    /// <param name="url">Absolute address to download.</param>
    void ICrawlRequestHandler.Crawl(IPort<ICrawlResponseHandler> collector, string url)
    {
        // Report a null, empty or malformed URL as a failed crawl rather than letting
        // the Uri constructor throw while this message is being processed.
        Uri address;
        if (!Uri.TryCreate(url, UriKind.Absolute, out address))
        {
            var invalidUrl = new UriFormatException(
                string.Format("'{0}' is not a valid absolute URL.", url));
            collector.Post(c => c.Failed(this, url, invalidUrl));
            return;
        }

        WebClient client = new WebClient();
        client.DownloadStringCompleted += (sender, e) =>
        {
            try
            {
                if (e.Error == null)
                {
                    // Read the result now so the posted message does not depend on
                    // the event args after the client is disposed.
                    string content = e.Result;
                    this.Post(c => c.Crawled(collector, url, content));
                }
                else
                {
                    collector.Post(c => c.Failed(this, url, e.Error));
                }
            }
            finally
            {
                // One client per request: release it once the request has completed.
                client.Dispose();
            }
        };
        client.DownloadStringAsync(address);
    }

    #endregion

    /// <summary>
    /// Extracts the distinct absolute http links from downloaded content and reports
    /// success to the collector.
    /// </summary>
    /// <param name="collector">Port that receives the <see cref="ICrawlResponseHandler.Succeeded"/> call.</param>
    /// <param name="url">Address the content was downloaded from.</param>
    /// <param name="content">Downloaded page content.</param>
    private void Crawled(IPort<ICrawlResponseHandler> collector, string url, string content)
    {
        // Capture group 1 holds the href target; only double-quoted http:// links match.
        var matches = Regex.Matches(content, @"href=""(http://[^""]+)""").Cast<Match>();
        var links = matches.Select(m => m.Groups[1].Value).Distinct().ToList();
        collector.Post(c => c.Succeeded(this, url, content, links));
    }
}
/// <summary>
/// Actor that hands crawl requests to <see cref="Crawler"/> instances, stores the
/// content of crawled pages, and answers statistic requests.
/// </summary>
// NOTE(review): block delimiters ({ }), #endregion directives and some statements are
// absent in this copy of the file. The base list below ends in a comma, so the rest of
// it (presumably the handler interfaces implemented explicitly further down) is not
// visible — confirm against the original.
public class Monitor : Actor<Action<Monitor>>, IPort<Monitor>,
/// <summary>Runs a posted message against this monitor.</summary>
protected override void Receive(Action<Monitor> message) { message(this); }
// URLs checked before crawling. NOTE(review): only Contains() calls are visible; no
// statement that adds to this set appears in this copy — confirm.
private HashSet<string> m_allUrls;
// URLs waiting for a crawler; drained by DispatchCrawlingTasks. NOTE(review): no
// Enqueue call is visible in this copy — confirm.
private Queue<string> m_readyToCrawl;
// Maps a crawled URL to its downloaded content (written in Succeeded).
private Dictionary<string, string> m_urlContent;
/// <summary>Upper bound compared against <see cref="WorkingCrawlerCount"/> before a new crawler is created.</summary>
public int MaxCrawlerCount { private set; get; }
/// <summary>Crawler count compared against <see cref="MaxCrawlerCount"/>; starts at 0.</summary>
// NOTE(review): no statement that increments or decrements this is visible in this copy.
public int WorkingCrawlerCount { private set; get; }
/// <summary>Creates a monitor with empty collections and no working crawlers.</summary>
/// <param name="crawlerCount">Value stored in <see cref="MaxCrawlerCount"/>.</param>
public Monitor(int crawlerCount)
this.m_allUrls = new HashSet<string>();
this.m_readyToCrawl = new Queue<string>();
this.m_urlContent = new Dictionary<string, string>();
this.MaxCrawlerCount = crawlerCount;
this.WorkingCrawlerCount = 0;
/// <summary>
/// Requests that a URL be crawled. Does nothing when the URL is already in the known
/// set; otherwise, while below the crawler limit, creates a new crawler and posts the
/// request to it with this monitor as the collector.
/// </summary>
/// <param name="url">Address to crawl.</param>
// NOTE(review): what happens when the limit has been reached is not visible in this
// copy (presumably the URL is queued) — confirm.
public void Crawl(string url)
if (this.m_allUrls.Contains(url)) return;
if (this.WorkingCrawlerCount < this.MaxCrawlerCount)
IPort<ICrawlRequestHandler> crawler = new Crawler();
crawler.Post(c => c.Crawl(this, url));
#region ICrawlResponseHandler Members
/// <summary>
/// Handles a successful crawl: stores the content under its URL, logs the result, and
/// examines each extracted link that is not yet in the known set.
/// </summary>
/// <param name="crawler">Port of the crawler that finished.</param>
/// <param name="url">Address that was crawled.</param>
/// <param name="content">Downloaded content.</param>
/// <param name="links">Links extracted from the content.</param>
void ICrawlResponseHandler.Succeeded(IPort<ICrawlRequestHandler> crawler, string url, string content, List<string> links)
this.m_urlContent[url] = content;
Console.WriteLine("{0} crawled, {1} link(s).", url, links.Count);
foreach (var newUrl in links)
// NOTE(review): the action taken for a previously unseen link is not visible in this copy.
if (!this.m_allUrls.Contains(newUrl))
/// <summary>Handles a failed crawl by logging the URL and the error message.</summary>
/// <param name="crawler">Port of the crawler that failed.</param>
/// <param name="url">Address that could not be crawled.</param>
/// <param name="ex">Error reported by the crawler.</param>
void ICrawlResponseHandler.Failed(IPort<ICrawlRequestHandler> crawler, string url, Exception ex)
Console.WriteLine("{0} error occurred: {1}.", url, ex.Message);
/// <summary>
/// Hands queued URLs to crawlers: the next queued URL goes to the supplied crawler,
/// then further URLs go to newly created crawlers while the queue is non-empty and the
/// working count is below the limit.
/// </summary>
/// <param name="reusableCrawler">Crawler port that receives the first dequeued URL.</param>
// NOTE(review): no call site for this method is visible in this copy.
private void DispatchCrawlingTasks(IPort<ICrawlRequestHandler> reusableCrawler)
// NOTE(review): the body of this empty-queue check is not visible in this copy; the
// Dequeue below would throw on an empty queue unless that branch exits early — confirm.
if (this.m_readyToCrawl.Count <= 0)
var url = this.m_readyToCrawl.Dequeue();
reusableCrawler.Post(c => c.Crawl(this, url));
while (this.m_readyToCrawl.Count > 0 &&
this.WorkingCrawlerCount < this.MaxCrawlerCount)
var newUrl = this.m_readyToCrawl.Dequeue();
IPort<ICrawlRequestHandler> crawler = new Crawler();
crawler.Post(c => c.Crawl(this, newUrl));
#region IStatisticRequestHandelr Members
/// <summary>Replies to the requester with the number of URLs whose content is stored.</summary>
/// <param name="requester">Port that receives the ReplyCrawledCount call.</param>
void IStatisticRequestHandelr.GetCrawledCount(IPort<IStatisticResponseHandler> requester)
requester.Post(r => r.ReplyCrawledCount(this.m_urlContent.Count));
/// <summary>
/// Replies to the requester with the stored content for a URL, or null when nothing is
/// stored for it.
/// </summary>
/// <param name="requester">Port that receives the ReplyContent call.</param>
/// <param name="url">Address whose content is requested.</param>
void IStatisticRequestHandelr.GetContent(IPort<IStatisticResponseHandler> requester, string url)
string content;
if (!this.m_urlContent.TryGetValue(url, out content))
content = null;
requester.Post(r => r.ReplyContent(url, content));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment