Skip to content

Instantly share code, notes, and snippets.

@aprimc
Created September 26, 2012 18:24
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aprimc/3789675 to your computer and use it in GitHub Desktop.
Save aprimc/3789675 to your computer and use it in GitHub Desktop.
robots.txt parser for Erlang
%% @author Andrej Primc
%% @copyright 2011 Andrej Primc
%% @doc Utility for parsing robots.txt file.
-module(robots_txt).
-export([parse/1, parse/2, is_allowed/2, is_allowed/3]).
%% @spec parse(Text, Http_response_code) -> directives()
%% @doc Turn a robots.txt fetch result (body text plus HTTP status code)
%% into a list of directives.
%%
%% The status code decides the outcome before the body is looked at:
%% 404 and codes from 500 upwards give the empty list (everything allowed),
%% any other code from 400 upwards gives the marker list ['Disallow']
%% (everything disallowed), and every remaining code hands Text to parse/1.
%%
parse(_Text, Code) when Code =:= 404; Code >= 500 ->
    %% not found, or server error -> allow
    [];
parse(_Text, Code) when Code >= 400 ->
    %% remaining 4xx -> disallow
    ['Disallow'];
parse(Text, _Code) ->
    parse(Text).
%% @spec parse(Text) -> directives()
%% @doc Parse the body of a robots.txt file into a list of directives.
%%
%% Text is cut into lines at every run of "\n" / "\r" characters (so blank
%% lines vanish at this stage), each line has its surrounding spaces
%% removed, and the result is handed to parse_lines/2.
%%
parse(Text) ->
    Stripped = [string:strip(Line) || Line <- string:tokens(Text, "\n\r")],
    parse_lines(Stripped, []).
%% parse_lines(Lines, Acc) -> directives()
%%
%% Convert a list of text lines into directives, preserving file order.
%% Acc collects the directives found so far in reverse order and is
%% reversed once all lines are consumed.
%%
%% For each line:
%%   * everything from the first "#" onwards is a comment and is dropped,
%%     so both whole-line comments and trailing comments
%%     ("Disallow: /x # note") are handled;
%%   * what remains is split at the first ":" into a name and a value;
%%   * the name is matched case-insensitively; "user-agent" and "disallow"
%%     produce {'User-agent', Value} and {'Disallow', Value};
%%   * lines without ":" (including empty lines) and lines with any other
%%     name are skipped.
%%
parse_lines([], Dirs) ->
    lists:reverse(Dirs);
parse_lines([Line|Lines], Dirs) ->
    %% cut off the comment part, if any
    {Content, _Comment} = lists:splitwith(fun(C) -> C =/= $# end, Line),
    %% try to match "Directive: Value"
    case lists:splitwith(fun(C) -> C =/= $: end, Content) of
        {Name, [$:|Value]} ->
            Stripped = string:strip(Value),
            case string:to_lower(string:strip(Name)) of
                "user-agent" ->
                    parse_lines(Lines, [{'User-agent', Stripped}|Dirs]);
                "disallow" ->
                    parse_lines(Lines, [{'Disallow', Stripped}|Dirs]);
                _Unknown ->
                    parse_lines(Lines, Dirs)
            end;
        {_NoColon, []} ->
            parse_lines(Lines, Dirs)
    end.
%% @spec is_allowed(Url, Directives) -> true | false
%% @doc Answer whether a crawler that gives no name of its own may read Url.
%% Delegates to is_allowed/3 using the wildcard agent "*".
%%
is_allowed(Url, List) ->
    Wildcard = "*",
    is_allowed(Wildcard, Url, List).
%% @spec is_allowed(User_agent, Url, Directives) -> true | false
%% @doc Returns true if User_agent is allowed to read Url.
%%
%% Only the first token of User_agent is used, where tokens are separated
%% by spaces and slashes ("foobot/0.1" -> "foobot"). An agent with no token
%% at all ("" or nothing but spaces/slashes) is treated as the wildcard
%% agent "*" instead of crashing.
%%
%% Url is reduced to the third element of mochiweb_util:urlsplit/1's result
%% (the path); an empty path is replaced by "/".
%%
is_allowed(Agent, Url, List) ->
    %% string:tokens/2 skips leading separators and returns [] when the
    %% input holds no token, so no separate stripping or "" clause is needed
    Name = case string:tokens(Agent, " /") of
               [Token|_] -> Token;
               [] -> "*"
           end,
    %% get the path only
    Path = case element(3, mochiweb_util:urlsplit(Url)) of
               "" -> "/"; % replace "" with "/"
               NonEmpty -> NonEmpty
           end,
    is_allowed(Name, Path, List, false).
%% is_allowed(Agent, Url, Directives, Match) -> true | false
%%
%% Walk the directives in order and return the first decision that applies
%% to Agent. Match says whether the rules currently being read belong to a
%% record that applies to Agent; callers start with false.
%%
%% Agent names are compared case-insensitively, and "*" in the directives
%% matches every agent.
%%
is_allowed(Agent, Url, Directives, Match) ->
    match_rules(string:to_lower(Agent), Url, Directives, Match, false).

%% match_rules(Agent, Url, Directives, Match, InAgents) -> true | false
%%
%% Worker for is_allowed/4. Agent is already lower-cased. InAgents is true
%% while the previous directive was a User-agent line, so that several
%% consecutive User-agent lines are treated as one record sharing the rules
%% that follow them.
%%
match_rules(_Agent, _Url, [], _Match, _InAgents) ->
    %% ran out of directives without a matching Disallow
    true;
match_rules(_Agent, _Url, ['Disallow'], _Match, _InAgents) ->
    %% the parser put this here because of the HTTP response code
    false;
match_rules(Agent, Url, [{'User-agent', Name}|T], Match, InAgents) ->
    Hit = (Name =:= "*") orelse (string:to_lower(Name) =:= Agent),
    %% a match from an immediately preceding User-agent line is kept;
    %% after any rule line a User-agent line starts a new record
    match_rules(Agent, Url, T, Hit orelse (InAgents andalso Match), true);
match_rules(Agent, Url, [_|T], false, _InAgents) ->
    %% rule of a record that does not apply to Agent
    match_rules(Agent, Url, T, false, false);
match_rules(_Agent, _Url, [{'Disallow', ""}|_T], true, _InAgents) ->
    %% empty Disallow value: allowed
    true;
match_rules(Agent, Url, [{'Disallow', Prefix}|T], true, _InAgents) ->
    case lists:prefix(Prefix, Url) of
        true ->
            false;
        false ->
            match_rules(Agent, Url, T, true, false)
    end;
match_rules(Agent, Url, [_|T], true, _InAgents) ->
    %% ignore unknown directives
    match_rules(Agent, Url, T, true, false).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment