@JeffreyZhao
Created July 15, 2009 16:47
A simple web crawler based on ActorLite
using System;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using ActorLite; // Actor&lt;T&gt; base class from ActorLite; namespace assumed from typical usage

namespace Crawling
{
    // Worker actor: downloads a page, extracts its links and reports back
    // to the Monitor that owns it.
    public class Crawler : Actor<Message>, ICrawlingHandler
    {
        protected override void Receive(Message message)
        {
            // Each message is a closure; execute it against this actor.
            message(this);
        }

        public Crawler(Monitor monitor)
        {
            this.m_monitor = monitor;
        }

        private Monitor m_monitor;

        #region ICrawlingHandler Members

        void ICrawlingHandler.Crawl(string url)
        {
            WebClient client = new WebClient() { Encoding = Encoding.UTF8 };
            client.DownloadStringCompleted += (sender, e) =>
            {
                if (e.Error == null)
                {
                    // Post the downloaded content back to this actor's own mailbox.
                    this.Post(h => ((ICrawlingHandler)h).Crawled(url, e.Result));
                }
                else
                {
                    // On failure, hand the crawler back to the monitor with no links.
                    this.m_monitor.Post(h => ((IMonitoringHandler)h).Collect(
                        this, url, Enumerable.Empty<string>()));
                }
            };
            client.DownloadStringAsync(new Uri(url));
        }

        void ICrawlingHandler.Crawled(string url, string content)
        {
            // Extract absolute http links from href="..." attributes.
            var matches = Regex.Matches(content, @"href=""(http://[^""]+)""").Cast<Match>();
            var links = matches.Select(m => m.Groups[1].Value).Distinct().ToList();

            Console.WriteLine("{0} crawled, {1} link(s).", url, links.Count);

            this.m_monitor.Post(h => ((IMonitoringHandler)h).Collect(this, url, links));
        }

        #endregion
    }
}
using System.Collections.Generic;

namespace Crawling
{
    // A message is simply a closure executed against the receiving actor.
    public delegate void Message(object handler);

    public interface ICrawlingHandler
    {
        void Crawl(string url);
        void Crawled(string url, string content);
    }

    public interface IMonitoringHandler
    {
        void Dispatch(string url);
        void Collect(Crawler crawler, string url, IEnumerable<string> links);
    }
}
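ActorLite itself is not included in this gist. The classes here only rely on Actor&lt;T&gt; exposing a public Post(T) method and a protected Receive(T) callback, which is how they use it above. For reference, a minimal stand-in with that shape (a locked mailbox drained on a thread-pool thread); this is an illustrative sketch, not ActorLite's actual implementation:

using System.Collections.Generic;
using System.Threading;

namespace ActorLite
{
    // Hypothetical stand-in for ActorLite's Actor<T>: a mailbox plus at most
    // one active drain loop per actor, so messages are processed one at a time.
    public abstract class Actor<T>
    {
        private readonly Queue<T> m_mailbox = new Queue<T>();
        private bool m_running = false;

        // Implemented by derived actors (Crawler, Monitor) to handle one message.
        protected abstract void Receive(T message);

        public void Post(T message)
        {
            lock (this.m_mailbox)
            {
                this.m_mailbox.Enqueue(message);
                if (this.m_running) return; // a drain loop is already active
                this.m_running = true;
            }

            // Drain the mailbox on a thread-pool thread.
            ThreadPool.QueueUserWorkItem(_ =>
            {
                while (true)
                {
                    T next;
                    lock (this.m_mailbox)
                    {
                        if (this.m_mailbox.Count == 0)
                        {
                            this.m_running = false;
                            return;
                        }
                        next = this.m_mailbox.Dequeue();
                    }
                    this.Receive(next);
                }
            });
        }
    }
}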
using System.Collections.Generic;
using ActorLite; // Actor&lt;T&gt; base class from ActorLite; namespace assumed from typical usage

namespace Crawling
{
    // Coordinator actor: owns a fixed pool of Crawler actors, deduplicates
    // URLs, queues work when no crawler is idle, and hands queued URLs to
    // crawlers as they report back.
    public class Monitor : Actor<Message>, IMonitoringHandler
    {
        protected override void Receive(Message message)
        {
            message(this);
        }

        private HashSet<string> m_crawled = new HashSet<string>();
        private Queue<string> m_readyToCrawl = new Queue<string>();
        private Stack<Crawler> m_crawlers = new Stack<Crawler>();

        public Monitor(int crawlerCount)
        {
            for (int i = 0; i < crawlerCount; i++)
            {
                this.m_crawlers.Push(new Crawler(this));
            }
        }

        public void Crawl(string url)
        {
            this.Post(h => ((IMonitoringHandler)h).Dispatch(url));
        }

        #region IMonitoringHandler Members

        void IMonitoringHandler.Dispatch(string url)
        {
            if (this.m_crawled.Contains(url)) return;

            if (this.m_crawlers.Count > 0)
            {
                // An idle crawler is available: dispatch immediately.
                var crawler = this.m_crawlers.Pop();
                crawler.Post(h => ((ICrawlingHandler)h).Crawl(url));
            }
            else
            {
                this.m_readyToCrawl.Enqueue(url);
            }
        }

        void IMonitoringHandler.Collect(Crawler crawler, string url, IEnumerable<string> links)
        {
            // The crawler is idle again; mark its page as crawled.
            this.m_crawlers.Push(crawler);
            this.m_crawled.Add(url);

            foreach (var newUrl in links)
            {
                if (this.m_crawled.Contains(newUrl)) continue;
                this.m_readyToCrawl.Enqueue(newUrl);
            }

            // Hand queued URLs to idle crawlers.
            while (this.m_crawlers.Count > 0 && this.m_readyToCrawl.Count > 0)
            {
                var urlToCrawl = this.m_readyToCrawl.Dequeue();
                if (this.m_crawled.Contains(urlToCrawl)) continue;

                var nextCrawler = this.m_crawlers.Pop();
                nextCrawler.Post(h => ((ICrawlingHandler)h).Crawl(urlToCrawl));
            }
        }

        #endregion
    }
}
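The gist stops at the Monitor and does not show a driver. A minimal sketch of how the pieces might be wired together (the crawler count and seed URL below are illustrative):

using System;

namespace Crawling
{
    internal static class Program
    {
        private static void Main()
        {
            // Hypothetical driver: ten crawler actors, one seed URL.
            var monitor = new Monitor(10);
            monitor.Crawl("http://www.example.com/");

            // Crawl only posts a Dispatch message and returns immediately;
            // the actors run on background threads, so keep the process alive.
            Console.ReadLine();
        }
    }
}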