Skip to content

Instantly share code, notes, and snippets.

@yetanotherchris
Created February 14, 2013 23:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yetanotherchris/4957320 to your computer and use it in GitHub Desktop.
Save yetanotherchris/4957320 to your computer and use it in GitHub Desktop.
Extract all hyperlinks from an HTML page
/// <summary>
/// Console demo: downloads a web page and prints every link found in it.
/// </summary>
class Program
{
    /// <summary>
    /// Downloads a page, extracts its href/src links with
    /// <see cref="LinkExtractor.Extract"/> and writes one link per line to the console.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        // Example usage:
        byte[] buffer;

        // WebClient is IDisposable, so release it as soon as the download has finished.
        using (WebClient client = new WebClient())
        {
            buffer = client.DownloadData("http://www.yahoo.jp");
        }

        // GetString() is an extension method that is not defined in this file; it is from:
        // http://www.shrinkrays.net/code-snippets/csharp/an-extension-method-for-converting-a-byte-array-to-a-string.aspx
        string html = buffer.GetString();

        List<string> list = LinkExtractor.Extract(html);
        foreach (var link in list)
        {
            Console.WriteLine(link);
        }

        // Keep the console window open until the user presses Enter.
        Console.ReadLine();
    }
}
/// <summary>
/// Regex-based helper that pulls href and src attribute values out of HTML text.
/// </summary>
public class LinkExtractor
{
    // Matches "href=" or "src=" (any letter casing), an optional opening quote, then lazily
    // captures the value up to the next quote or '>'.
    // Note: '|' has no alternation meaning inside a character class, so the classes list only
    // the delimiter characters; a literal pipe inside a URL must not end the value.
    // Singleline lets '.' match newlines, so a value may span several lines.
    // Built once and reused instead of being re-parsed on every call.
    private static readonly Regex LinkPattern = new Regex(
        "(?:href|src)=[\"']?(.*?)[\"'>]+",
        RegexOptions.Singleline | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase);

    /// <summary>
    /// Extracts all src and href links from an HTML string.
    /// </summary>
    /// <param name="html">The html source</param>
    /// <returns>
    /// A list of links, in the order they appear - these will be all links including
    /// javascript ones. The list is empty when nothing matches.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
    public static List<string> Extract(string html)
    {
        if (html == null)
        {
            throw new ArgumentNullException("html");
        }

        List<string> list = new List<string>();

        // Matches() yields an empty collection when nothing is found, so a separate
        // IsMatch() pre-check would only scan the input a second time.
        foreach (Match match in LinkPattern.Matches(html))
        {
            // Group 1 is the attribute value without its surrounding quotes.
            list.Add(match.Groups[1].Value);
        }

        return list;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment