Skip to content

Instantly share code, notes, and snippets.

@yemrekeskin
Created May 15, 2014 07:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yemrekeskin/b1e362e32eb0cf5a8b49 to your computer and use it in GitHub Desktop.
Save yemrekeskin/b1e362e32eb0cf5a8b49 to your computer and use it in GitHub Desktop.
SimpleWebCrawler that follows links two levels deep - console app
/// <summary>
/// Console entry point: crawls a fixed start page and prints the links found
/// on it, followed by the links found on each of those linked pages.
/// </summary>
class Program
{
    /// <summary>
    /// Prints every link returned for the start page; under each one, prints the
    /// links returned for that link, prefixed with "----". Then blocks on
    /// <c>Console.ReadLine</c> before returning.
    /// </summary>
    /// <param name="args">Command-line arguments; not read by this method.</param>
    static void Main(string[] args)
    {
        var linkCrawler = new LinkCrawler();
        var topLevelLinks = linkCrawler.Catch("http://blog.yemrekeskin.com/en/");

        for (int index = 0; index < topLevelLinks.Count; index++)
        {
            string parentLink = topLevelLinks[index];
            Console.WriteLine(parentLink);

            // Second level: crawl the link that was just printed and list its children.
            linkCrawler.Catch(parentLink)
                       .ForEach(childLink => Console.WriteLine("----" + childLink));
        }

        // Wait for a line of input before the process ends.
        Console.ReadLine();
    }
}
/// <summary>
/// Contract for a crawler that turns a single link into a list of strings.
/// </summary>
public interface ILinkCrawler
{
    /// <summary>
    /// Processes <paramref name="link"/> and returns the strings collected for it.
    /// </summary>
    /// <param name="link">
    /// The link to process. NOTE(review): presumably a web address; the interface
    /// itself places no constraint on the format - confirm against implementations.
    /// </param>
    /// <returns>
    /// A list of strings. In the <c>LinkCrawler</c> implementation in this file these
    /// are link targets extracted from the page downloaded for <paramref name="link"/>.
    /// </returns>
    List<string> Catch(string link);
}
/// <summary>
/// Downloads a page over HTTP and extracts the targets of the absolute
/// (http/https) hyperlinks found in its anchor tags.
/// </summary>
public class LinkCrawler
    : ILinkCrawler
{
    // Matches an anchor tag whose href is a double-quoted absolute http(s) URL and
    // captures everything after "://" up to the closing quote. Character classes
    // are used instead of greedy ".+" so that several anchors on one line are each
    // matched separately and the match never runs past the end of the tag.
    private static readonly Regex AnchorHrefPattern = new Regex(
        "<a\\s(?:[^>]*?\\s)?href=\"https?://([^\"]+)\"",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Fetches the page at <paramref name="link"/> and returns the link targets found in it.
    /// </summary>
    /// <param name="link">
    /// Address of the page to crawl. It is passed through
    /// <c>StringHelper.CheckLinkControl</c> before the request is made.
    /// </param>
    /// <returns>
    /// The captured link targets in document order, without their scheme prefix
    /// (e.g. <c>example.com/page</c>). The list is empty when <paramref name="link"/>
    /// looks like an image, when the page cannot be downloaded, or when it contains
    /// no matching anchors. Duplicates are not removed.
    /// </returns>
    /// <exception cref="ApplicationException">
    /// <paramref name="link"/> is null or empty.
    /// </exception>
    public List<string> Catch(string link)
    {
        if (String.IsNullOrEmpty(link))
            throw new ApplicationException("The link to crawl must not be null or empty.");

        List<string> links = new List<string>();

        // Images contain no anchors, so skip the download entirely.
        if (link.IsImage())
            return links;

        StringHelper.CheckLinkControl(ref link);

        string html = Download(link);
        foreach (Match match in AnchorHrefPattern.Matches(html))
        {
            links.Add(match.Groups[1].Value);
        }

        return links;
    }

    /// <summary>
    /// Downloads the body of <paramref name="link"/> as UTF-8 text. Crawling is
    /// best-effort: any failure (malformed address, network error, HTTP error
    /// status) is reported through <c>System.Diagnostics.Trace</c> and yields an
    /// empty string, so one bad link does not abort the whole crawl.
    /// </summary>
    private static string Download(string link)
    {
        try
        {
            // Creating the request is inside the try block because it throws on
            // addresses that cannot be parsed as a URI.
            WebRequest request = WebRequest.Create(link);

            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            System.Diagnostics.Trace.TraceWarning("Could not download '{0}': {1}", link, ex.Message);
            return String.Empty;
        }
    }
}
/// <summary>
/// Extension helper that recognises image addresses by their file extension.
/// </summary>
public static class ImageExtention
{
    /// <summary>
    /// Upper-case file extensions (including the leading dot) treated as images.
    /// </summary>
    public static readonly List<string> ImageExtensions = new List<string> { ".JPG", ".JPE", ".JPEG", ".BMP", ".GIF", ".PNG" };

    // Characters that start the query string / fragment part of a URL.
    private static readonly char[] QueryOrFragmentStart = new[] { '?', '#' };

    /// <summary>
    /// Tells whether <paramref name="url"/> ends in one of <see cref="ImageExtensions"/>.
    /// The comparison ignores case and ignores any query string or fragment.
    /// </summary>
    /// <param name="url">The address or file name to inspect; may be null or empty.</param>
    /// <returns>
    /// <c>true</c> when the extension is a known image extension; <c>false</c>
    /// otherwise, including for null, empty, or unparsable input.
    /// </returns>
    public static bool IsImage(this string url)
    {
        if (String.IsNullOrEmpty(url))
            return false;

        // "a.png?size=1" or "a.png#top" must still count as ".png".
        int cut = url.IndexOfAny(QueryOrFragmentStart);
        string pathPart = cut >= 0 ? url.Substring(0, cut) : url;

        string extension;
        try
        {
            extension = Path.GetExtension(pathPart);
        }
        catch (ArgumentException)
        {
            // Some runtimes reject strings containing characters that are invalid in paths.
            return false;
        }

        return ImageExtensions.Contains(extension.ToUpperInvariant());
    }
}
/// <summary>
/// String helpers for normalising link text.
/// </summary>
public static class StringHelper
{
    private const string HttpScheme = "http://";
    private const string HttpsScheme = "https://";

    /// <summary>
    /// Ensures <paramref name="link"/> starts with a web scheme by prepending
    /// <c>http://</c> when it starts with neither <c>http://</c> nor <c>https://</c>
    /// (compared case-insensitively).
    /// </summary>
    /// <param name="link">
    /// The link to normalise, updated in place. Null or empty values are left
    /// untouched because there is nothing to prefix.
    /// </param>
    public static void CheckLinkControl(ref string link)
    {
        if (String.IsNullOrEmpty(link))
            return;

        // StartsWith is safe for links shorter than the scheme itself, unlike
        // a fixed-length Substring.
        bool hasScheme =
            link.StartsWith(HttpScheme, StringComparison.OrdinalIgnoreCase) ||
            link.StartsWith(HttpsScheme, StringComparison.OrdinalIgnoreCase);

        if (!hasScheme)
            link = HttpScheme + link;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment