Created
May 15, 2014 07:41
-
-
Save yemrekeskin/b1e362e32eb0cf5a8b49 to your computer and use it in GitHub Desktop.
SimpleWebCrawler that follows links two levels deep - console app
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Console entry point: prints every link returned for the start page and, beneath each
/// one (prefixed with "----"), the links returned for that link in turn.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        var linkCrawler = new LinkCrawler();

        // First level: whatever the crawler returns for the start page.
        foreach (string pageLink in linkCrawler.Catch("http://blog.yemrekeskin.com/en/"))
        {
            Console.WriteLine(pageLink);
            PrintChildLinks(linkCrawler, pageLink);
        }

        // Block until a line is read from standard input before exiting.
        Console.ReadLine();
    }

    /// <summary>
    /// Second level: prints the links returned for <paramref name="parentLink"/>,
    /// each prefixed with "----".
    /// </summary>
    private static void PrintChildLinks(LinkCrawler linkCrawler, string parentLink)
    {
        foreach (string childLink in linkCrawler.Catch(parentLink))
        {
            Console.WriteLine("----" + childLink);
        }
    }
}
/// <summary>
/// Contract for a component that turns one link into a list of strings.
/// </summary>
public interface ILinkCrawler
{
    /// <summary>
    /// Processes <paramref name="link"/> and returns the resulting list of strings.
    /// </summary>
    /// <param name="link">The link to process.</param>
    /// <returns>The strings produced for <paramref name="link"/>.</returns>
    List<string> Catch(string link);
}
/// <summary>
/// Downloads a page and extracts the absolute http/https link targets of its anchor tags.
/// </summary>
public class LinkCrawler
    : ILinkCrawler
{
    // Matches an <a ...> tag whose href is an absolute http or https URL.
    // Group 1 captures the URL with its scheme removed.
    //  - "https?://" accepts plain http links as well as https ones.
    //  - Negated character classes ([^>], [^"]) keep each match inside a single tag,
    //    so several anchors on one line are all reported individually.
    //  - Whitespace is required directly before "href" so attributes such as
    //    "data-href" are not mistaken for it.
    private static readonly Regex AnchorHrefPattern = new Regex(
        "<a\\s+(?:[^>]*?\\s)?href\\s*=\\s*\"https?://([^\"]+)\"",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Fetches the page at <paramref name="link"/> and returns the targets of its
    /// absolute http/https anchors, each without the leading scheme
    /// (for example "example.com/page").
    /// </summary>
    /// <param name="link">Address of the page to fetch; "http://" is added when no scheme is present.</param>
    /// <returns>
    /// The extracted link targets. The list is empty when <paramref name="link"/> looks like
    /// an image, when the page cannot be downloaded, or when it contains no matching anchors.
    /// </returns>
    /// <exception cref="ApplicationException"><paramref name="link"/> is null or empty.</exception>
    public List<string> Catch(string link)
    {
        if (String.IsNullOrEmpty(link))
            throw new ApplicationException("link must not be null or empty.");

        List<string> links = new List<string>();

        // Images carry no anchors, so skip the download entirely.
        if (link.IsImage())
            return links;

        StringHelper.CheckLinkControl(ref link);

        string html = Download(link);
        foreach (Match anchor in AnchorHrefPattern.Matches(html))
        {
            links.Add(anchor.Groups[1].Value);
        }

        return links;
    }

    /// <summary>
    /// Returns the body of <paramref name="link"/> decoded as UTF-8, or an empty string
    /// when the address is malformed or the request fails.
    /// </summary>
    private static string Download(string link)
    {
        try
        {
            WebRequest request = WebRequest.Create(link);
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Crawling is best effort: one unreachable or malformed link must not abort
            // the whole run, so the failure is recorded and treated as an empty page.
            System.Diagnostics.Trace.TraceWarning(
                "LinkCrawler: could not download '{0}': {1}", link, ex.Message);
            return String.Empty;
        }
    }
}
/// <summary>
/// Extension methods for recognising image URLs by their file extension.
/// </summary>
public static class ImageExtention
{
    /// <summary>
    /// Upper-case file extensions (including the leading dot) that are treated as images.
    /// </summary>
    public static readonly List<string> ImageExtensions = new List<string> { ".JPG", ".JPE", ".JPEG", ".BMP", ".GIF", ".PNG" };

    // Characters that end the path part of a URL.
    private static readonly char[] QueryOrFragmentStart = { '?', '#' };

    /// <summary>
    /// Reports whether <paramref name="url"/> ends in one of <see cref="ImageExtensions"/>,
    /// ignoring case and any query string or fragment.
    /// </summary>
    /// <param name="url">The URL or path to inspect; may be null.</param>
    /// <returns>
    /// <c>true</c> when the last path segment has a listed image extension;
    /// <c>false</c> otherwise, including for null or empty input.
    /// </returns>
    public static bool IsImage(this string url)
    {
        if (String.IsNullOrEmpty(url))
            return false;

        // Drop "?query" / "#fragment" so "a.png?size=2" is still recognised.
        int pathEnd = url.IndexOfAny(QueryOrFragmentStart);
        string path = pathEnd >= 0 ? url.Substring(0, pathEnd) : url;

        // The extension is located by hand instead of with Path.GetExtension, which can
        // reject characters such as '|' or '<' that do occur in URLs on some runtimes.
        int lastDot = path.LastIndexOf('.');
        if (lastDot < 0 || lastDot < path.LastIndexOf('/'))
            return false;

        string extension = path.Substring(lastDot);
        return ImageExtensions.Contains(extension.ToUpperInvariant());
    }
}
/// <summary>
/// String utilities for normalising links.
/// </summary>
public static class StringHelper
{
    /// <summary>
    /// Ensures <paramref name="link"/> starts with a web scheme: links that begin with
    /// neither "http://" nor "https://" (compared case-insensitively) get "http://" prepended.
    /// </summary>
    /// <param name="link">The link to normalise; updated in place.</param>
    /// <exception cref="ArgumentNullException"><paramref name="link"/> is null.</exception>
    public static void CheckLinkControl(ref string link)
    {
        if (link == null)
            throw new ArgumentNullException("link");

        const string web = "http://";
        const string secureWeb = "https://";

        // StartsWith is safe for links shorter than the prefix, and checking both schemes
        // keeps https links from being turned into "http://https://...".
        bool hasScheme = link.StartsWith(web, StringComparison.OrdinalIgnoreCase)
            || link.StartsWith(secureWeb, StringComparison.OrdinalIgnoreCase);

        if (!hasScheme)
            link = web + link;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment