Created
July 15, 2009 16:47
-
-
Save JeffreyZhao/147835 to your computer and use it in GitHub Desktop.
A simple web crawler based on ActorLite
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace Crawling
{
    /// <summary>
    /// An actor that downloads a single page at a time and reports the links it
    /// finds back to its <see cref="Monitor"/>.
    /// </summary>
    /// <remarks>
    /// Every <c>Crawl</c> request ends in exactly one <c>Collect</c> message to the
    /// monitor, whether the download succeeded or not, because <c>Collect</c> is how
    /// the monitor gets this crawler back.
    /// NOTE(review): <c>Actor&lt;Message&gt;</c> is declared outside this file; it is
    /// presumably the ActorLite base class whose <c>Post</c> queues a message that is
    /// later passed to <c>Receive</c> - confirm.
    /// </remarks>
    public class Crawler : Actor<Message>, ICrawlingHandler
    {
        /// <summary>Matches absolute <c>http://</c> links inside <c>href="..."</c> attributes.</summary>
        private static readonly Regex s_linkPattern = new Regex(@"href=""(http://[^""]+)""");

        /// <summary>The monitor that results are reported to.</summary>
        private readonly Monitor m_monitor;

        /// <summary>Creates a crawler that reports its results to <paramref name="monitor"/>.</summary>
        /// <param name="monitor">The monitor receiving <c>Collect</c> messages.</param>
        public Crawler(Monitor monitor)
        {
            this.m_monitor = monitor;
        }

        /// <summary>Runs a received message against this crawler.</summary>
        /// <param name="message">The delegate to invoke with this instance as its handler.</param>
        protected override void Receive(Message message)
        {
            message(this);
        }

        #region ICrawlingHandler Members

        /// <summary>
        /// Starts an asynchronous download of <paramref name="url"/>. On success the
        /// content is posted back to this crawler as a <c>Crawled</c> message; on any
        /// failure an empty link list is reported to the monitor.
        /// </summary>
        /// <param name="url">The absolute URL to download.</param>
        void ICrawlingHandler.Crawl(string url)
        {
            // A malformed (or null) URL must not throw out of the handler: that would
            // skip the Collect message and the monitor would never get this crawler back.
            Uri address;
            if (!Uri.TryCreate(url, UriKind.Absolute, out address))
            {
                this.ReportFailure(url);
                return;
            }

            WebClient client = new WebClient() { Encoding = Encoding.UTF8 };
            client.DownloadStringCompleted += (sender, e) =>
            {
                try
                {
                    if (e.Error == null && !e.Cancelled)
                    {
                        // Read the result here, while the event args are known to be valid.
                        string content = e.Result;
                        this.Post(h => ((ICrawlingHandler)h).Crawled(url, content));
                    }
                    else
                    {
                        this.ReportFailure(url);
                    }
                }
                finally
                {
                    // One client per request: release it once the request has finished.
                    client.Dispose();
                }
            };
            client.DownloadStringAsync(address);
        }

        /// <summary>
        /// Extracts the distinct <c>http://</c> links from a downloaded page, logs the
        /// count to the console and reports the links to the monitor.
        /// </summary>
        /// <param name="url">The URL the content was downloaded from.</param>
        /// <param name="content">The downloaded page text.</param>
        void ICrawlingHandler.Crawled(string url, string content)
        {
            var matches = s_linkPattern.Matches(content).Cast<Match>();
            var links = matches.Select(m => m.Groups[1].Value).Distinct().ToList();

            Console.WriteLine("{0} crawled, {1} link(s).", url, links.Count);

            this.m_monitor.Post(h => ((IMonitoringHandler)h).Collect(this, url, links));
        }

        #endregion

        /// <summary>Reports a failed crawl of <paramref name="url"/> to the monitor as an empty link list.</summary>
        private void ReportFailure(string url)
        {
            this.m_monitor.Post(h => ((IMonitoringHandler)h).Collect(
                this, url, Enumerable.Empty<string>()));
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace Crawling
{
    /// <summary>
    /// A message exchanged between actors: a delegate that is invoked with the
    /// object that should handle it.
    /// </summary>
    /// <param name="handler">
    /// The object handling the message; senders in this project cast it to the
    /// handler interface they expect.
    /// </param>
    public delegate void Message(object handler);

    /// <summary>Operations that can be requested of a crawler.</summary>
    public interface ICrawlingHandler
    {
        /// <summary>Requests that <paramref name="url"/> be crawled.</summary>
        /// <param name="url">The URL to crawl.</param>
        void Crawl(string url);

        /// <summary>Delivers the content obtained for <paramref name="url"/>.</summary>
        /// <param name="url">The URL that was crawled.</param>
        /// <param name="content">The content retrieved from that URL.</param>
        void Crawled(string url, string content);
    }

    /// <summary>Operations that can be requested of a monitor.</summary>
    public interface IMonitoringHandler
    {
        /// <summary>Submits <paramref name="url"/> for crawling.</summary>
        /// <param name="url">The URL to hand out.</param>
        void Dispatch(string url);

        /// <summary>Reports the outcome of crawling <paramref name="url"/>.</summary>
        /// <param name="crawler">The crawler that performed the crawl.</param>
        /// <param name="url">The URL that was crawled.</param>
        /// <param name="links">The links found at that URL.</param>
        void Collect(Crawler crawler, string url, IEnumerable<string> links);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace Crawling
{
    /// <summary>
    /// An actor that owns a pool of <see cref="Crawler"/> instances, hands URLs out
    /// to idle crawlers and queues the rest until a crawler reports back.
    /// </summary>
    /// <remarks>
    /// NOTE(review): <c>Actor&lt;Message&gt;</c> is declared outside this file; the
    /// unsynchronized collections below assume it runs <c>Receive</c> for one
    /// message at a time - confirm.
    /// </remarks>
    public class Monitor : Actor<Message>, IMonitoringHandler
    {
        /// <summary>
        /// URLs that must not be dispatched again: those already handed to a crawler
        /// (still downloading) and those whose result has been collected.
        /// </summary>
        private readonly HashSet<string> m_crawled = new HashSet<string>();

        /// <summary>URLs waiting for an idle crawler.</summary>
        private readonly Queue<string> m_readyToCrawl = new Queue<string>();

        /// <summary>Crawlers that are currently idle.</summary>
        private readonly Stack<Crawler> m_crawlers = new Stack<Crawler>();

        /// <summary>Creates a monitor with a pool of <paramref name="crawlerCount"/> crawlers.</summary>
        /// <param name="crawlerCount">The number of crawlers to create.</param>
        public Monitor(int crawlerCount)
        {
            for (int i = 0; i < crawlerCount; i++)
            {
                this.m_crawlers.Push(new Crawler(this));
            }
        }

        /// <summary>Runs a received message against this monitor.</summary>
        /// <param name="message">The delegate to invoke with this instance as its handler.</param>
        protected override void Receive(Message message)
        {
            message(this);
        }

        /// <summary>Submits <paramref name="url"/> for crawling by posting a <c>Dispatch</c> message to this monitor.</summary>
        /// <param name="url">The URL to crawl.</param>
        public void Crawl(string url)
        {
            this.Post(h => ((IMonitoringHandler)h).Dispatch(url));
        }

        #region IMonitoringHandler Members

        /// <summary>
        /// Hands <paramref name="url"/> to an idle crawler, or queues it when none is
        /// available. URLs that are already crawled or in progress are ignored.
        /// </summary>
        /// <param name="url">The URL to crawl.</param>
        void IMonitoringHandler.Dispatch(string url)
        {
            if (this.m_crawled.Contains(url))
            {
                return;
            }

            if (this.m_crawlers.Count > 0)
            {
                this.StartCrawl(this.m_crawlers.Pop(), url);
            }
            else
            {
                this.m_readyToCrawl.Enqueue(url);
            }
        }

        /// <summary>
        /// Returns <paramref name="crawler"/> to the idle pool, queues the newly found
        /// links and then assigns queued URLs to idle crawlers until either runs out.
        /// </summary>
        /// <param name="crawler">The crawler that finished and is idle again.</param>
        /// <param name="url">The URL that crawler processed.</param>
        /// <param name="links">The links found at <paramref name="url"/>; empty when the crawl failed.</param>
        void IMonitoringHandler.Collect(Crawler crawler, string url, IEnumerable<string> links)
        {
            this.m_crawlers.Push(crawler);
            this.m_crawled.Add(url);

            foreach (var newUrl in links)
            {
                if (this.m_crawled.Contains(newUrl))
                {
                    continue;
                }

                this.m_readyToCrawl.Enqueue(newUrl);
            }

            while (this.m_crawlers.Count > 0 && this.m_readyToCrawl.Count > 0)
            {
                var urlToCrawl = this.m_readyToCrawl.Dequeue();

                // The queue may hold duplicates, or URLs dispatched since they were queued.
                if (this.m_crawled.Contains(urlToCrawl))
                {
                    continue;
                }

                // Give the work to the crawler taken from the pool, not to the
                // crawler that just reported in.
                this.StartCrawl(this.m_crawlers.Pop(), urlToCrawl);
            }
        }

        #endregion

        /// <summary>
        /// Marks <paramref name="url"/> as taken and posts a <c>Crawl</c> request for
        /// it to <paramref name="crawler"/>.
        /// </summary>
        private void StartCrawl(Crawler crawler, string url)
        {
            // Mark the URL now so that it is not handed to a second crawler while
            // this download is still running.
            this.m_crawled.Add(url);
            crawler.Post(h => ((ICrawlingHandler)h).Crawl(url));
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment