Skip to content

Instantly share code, notes, and snippets.

@daniello
Created June 11, 2009 14:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save daniello/127916 to your computer and use it in GitHub Desktop.
Save daniello/127916 to your computer and use it in GitHub Desktop.
%% Scrapes Google search result pages: fetches the HTML for a query over HTTP
%% and extracts result links and titles with regular expressions.
-module(google_scrapper).
%% NOTE(review): export_all exports every function in this module, helpers
%% included. Consider an explicit -export list once the public API is settled.
-compile(export_all).
%% Base search URL; the URL-encoded query string is appended to it.
-define(GOOGLE_URL, "http://www.google.co.uk/search?hl=en&btnG=Search&meta=&q=").
%% @doc Start the inets application.
%%
%% Returns whatever `inets:start/0' returns.
start() ->
    inets:start().
%% @doc Fetch the Google result page for the search query `Q'.
%%
%% `Q' is passed through `url_encoder:encode/1' and appended to `?GOOGLE_URL'.
%% The inets application must be running before this is called.
%%
%% Returns the response body as a binary when the server answers with status
%% 200, `{error, {http_status, Code}}' for any other status, and
%% `{error, Reason}' when the request itself fails.
fetch_google_results(Q) ->
    %% Let the HTTP client follow redirects for us.
    HTTPOptions = [{autoredirect, true}],
    %% Ask for the body as a binary rather than a string.
    Options = [{body_format, binary}],
    Headers = [
        %% Present ourselves as Firefox.
        {"User-Agent", "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.0.10) Gecko/2009042315 Firefox/3.0.10"},
        {"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},
        %% Prefer a UTF-8 encoded response.
        {"Accept-Charset", "utf-8;q=0.7,*;q=0.7"}
    ],
    Request = {?GOOGLE_URL ++ url_encoder:encode(Q), Headers},
    %% httpc is the inets HTTP client; the old `http' module it replaced is
    %% no longer part of OTP.
    case httpc:request(get, Request, HTTPOptions, Options) of
        %% Match on the status code only: the HTTP version and the reason
        %% phrase may legitimately differ between servers.
        {ok, {{_Version, 200, _Phrase}, _RespHeaders, Body}} ->
            Body;
        %% Any other status used to crash with case_clause; report it instead.
        {ok, {{_Version, Status, _Phrase}, _RespHeaders, _Body}} ->
            {error, {http_status, Status}};
        {error, _} = Error ->
            Error
    end.
%% @doc Run the regular expression `RE' over `B' and apply `Fun' to each match.
%%
%% The expression is applied globally and case-insensitively, in unicode mode,
%% with `dotall' and `multiline' set; captures are returned as binaries. Each
%% match is handed to `Fun' as a list `[WholeMatch | Groups]'.
%%
%% Returns the list of `Fun' results in match order, or `[]' when nothing
%% matches.
parse(B, RE, Fun) ->
    ReOpts = [global, caseless, unicode, dotall, multiline, {capture, all, binary}],
    case re:run(B, RE, ReOpts) of
        nomatch ->
            [];
        {match, Found} ->
            [Fun(Hit) || Hit <- Found]
    end.
%% @doc Extract the search results from a Google result page `B'.
%%
%% Every section found between a `<!--m-->' and a `<!--n-->' marker is handed
%% to `parse_google_result/1'. Returns a list with one entry per section, each
%% entry being the list that `parse_google_result/1' returned for it.
parse_google_results(B) ->
    RE = "<\\!--m-->(.*?)<\\!--n-->",
    %% Each match arrives as [Whole, Section]. Only the captured section is
    %% passed on: handing over the whole match list would make re:run/3 treat
    %% it as iodata (the section text twice in a row), so every result would
    %% be reported twice.
    parse(B, RE, fun([_Whole, Section]) -> parse_google_result(Section) end).
%% @doc Pull the link target and link text out of one result fragment.
%%
%% Searches `GResult' for `<li class=g' followed by an `<h3' and an
%% `<a href="...">' element. Returns one `{Href, Name}' tuple per occurrence,
%% or `[]' when there is none.
parse_google_result(GResult) ->
    %% Drop the whole-match element and keep the two capture groups.
    ToPair = fun([_Whole, Href, Name]) -> {Href, Name} end,
    parse(GResult, "<li class=g.*?<h3.*?<a href=\"(.*?)\".*?>(.*?)</a>", ToPair).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment