aprimc/robots_txt.erl

## robots_txt.erl
%% @author Andrej Primc
%% @copyright 2011 Andrej Primc

%% @doc Utility for parsing robots.txt file.


-module(robots_txt).
-export([parse/1, parse/2, is_allowed/2, is_allowed/3]).


%% @spec parse(Text, Http_response_code) -> directives()
%% @doc Turn a fetched robots.txt body and the HTTP status of the fetch
%% into a list of directives.
%%
%% A 404 or any status of 500 and above produces an empty list (nothing
%% is disallowed); any other status of 400 and above produces the special
%% list ['Disallow'] (everything is disallowed); every remaining status
%% hands Text to parse/1.
%%
parse(_Body, Status) when Status =:= 404; Status >= 500 ->
	% not found, or server error -> allow
	[];
parse(_Body, Status) when Status >= 400 ->
	% remaining 4xx -> disallow
	['Disallow'];
parse(Body, _Status) ->
	parse(Body).


%% @spec parse(Text) -> directives()
%% @doc Parse the text of a robots.txt file into a list of directives.
%%
%% Text is cut into lines at any run of CR/LF characters, each line is
%% stripped of surrounding blanks, and the result is passed to
%% parse_lines/2.
%%
parse(Text) ->
	RawLines = string:tokens(Text, "\n\r"),
	parse_lines([string:strip(Raw) || Raw <- RawLines], []).


% Collect the directives found in a list of robots.txt lines.
%
% Lines - list of strings, one per line of the file
% Dirs  - accumulator of directives found so far, newest first
%
% Returns a list of {'User-agent', Value} and {'Disallow', Value} tuples
% in file order. Directive names are matched case-insensitively. Blank
% lines, comment-only lines, lines without a colon and unknown directives
% are skipped.
%
parse_lines([], Dirs) ->
	lists:reverse(Dirs);
parse_lines([Line|Lines], Dirs) ->
	% a "#" starts a comment that runs to the end of the line; cut it off
	% first, otherwise "Disallow: /x # note" would yield the value
	% "/x # note", which never matches any URL
	Content = lists:takewhile(fun(C) -> C =/= $# end, Line),
	% split "Directive: Value" at the first colon
	case lists:splitwith(fun(C) -> C =/= $: end, Content) of
		{Name, [$:|Value]} ->
			Stripped = string:strip(Value),
			case string:to_lower(string:strip(Name)) of
				"user-agent" ->
					parse_lines(Lines, [{'User-agent', Stripped}|Dirs]);
				"disallow" ->
					parse_lines(Lines, [{'Disallow', Stripped}|Dirs]);
				_ ->
					% unknown directive
					parse_lines(Lines, Dirs)
			end;
		{_, []} ->
			% blank, comment-only or colon-less line
			parse_lines(Lines, Dirs)
	end.


%% @spec is_allowed(Url, Directives) -> true | false
%% @doc Returns true if a robot that does not name itself may read Url.
%% The check is made on behalf of the wildcard agent "*".
%%
is_allowed(Url, Directives) ->
	AnyAgent = "*",
	is_allowed(AnyAgent, Url, Directives).


%% @spec is_allowed(User_agent, Url, Directives) -> true | false
%% @doc Returns true if User_agent is allowed to read Url.
%%
%% User_agent is reduced to the text before its first space or slash, so
%% "foobot/0.1" becomes "foobot". An agent that contains nothing but
%% spaces and slashes (including "") is treated as "*". Only the path
%% component of Url is checked; an empty path is read as "/".
%%
is_allowed(Agent, Url, List) ->
	% string:tokens/2 returns [] when Agent holds no token at all
	% ("", " ", "/"); matching on the result avoids the hd([]) crash
	Agent1 = case string:tokens(Agent, " /") of
		[Name|_] -> Name;
		[] -> "*"
	end,
	% get the path only
	Path = case element(3, mochiweb_util:urlsplit(Url)) of
		"" -> "/";	% replace "" with "/"
		P -> P
	end,
	is_allowed(Agent1, Path, List, false).

% Walk the directive list and return the verdict of the first rule that
% decides the request.
%
% Agent - agent name, already reduced to its bare token
% Url   - path to check
% List  - directives as produced by parse/1 or parse/2
% Match - true while the rules being scanned belong to a group whose
%         User-agent header applies to Agent
%
% Returns false as soon as an applicable Disallow value is a prefix of
% Url, true on an applicable empty Disallow or when the list runs out.
%
is_allowed(_Agent, _Url, [], _Match) ->
	true;
is_allowed(_Agent, _Url, ['Disallow'], _Match) ->
	% the parser put this here because of the HTTP response code
	false;
is_allowed(Agent, Url, [{'User-agent', _}|_] = List, _Match) ->
	% consecutive User-agent lines form the header of ONE group; the
	% group applies if any of them is "*" or names Agent (compared
	% case-insensitively), so a later non-matching line must not cancel
	% an earlier match
	{Header, Rules} = lists:splitwith(
		fun({'User-agent', _}) -> true; (_) -> false end, List),
	Wanted = string:to_lower(Agent),
	Match = lists:any(
		fun({'User-agent', Name}) ->
			Name =:= "*" orelse string:to_lower(Name) =:= Wanted
		end, Header),
	is_allowed(Agent, Url, Rules, Match);
is_allowed(Agent, Url, [_|T], false) ->
	% rule of a group that does not apply to Agent
	is_allowed(Agent, Url, T, false);
is_allowed(_Agent, _Url, [{'Disallow', ""}|_T], true) ->
	% empty Disallow -> everything is allowed
	true;
is_allowed(Agent, Url, [{'Disallow', Prefix}|T], true) ->
	case lists:prefix(Prefix, Url) of
		true ->
			false;
		false ->
			is_allowed(Agent, Url, T, true)
	end;
is_allowed(Agent, Url, [_|T], true) ->
	% ignore unknown directives
	is_allowed(Agent, Url, T, true).
	%% @author Andrej Primc
	%% @copyright 2011 Andrej Primc

	%% @doc Utility for parsing robots.txt file.


	-module(robots_txt).
	-export([parse/1, parse/2, is_allowed/2, is_allowed/3]).


	%% @spec parse(Text, Http_response_code) -> directives()
	%% @doc Turn a fetched robots.txt body and the HTTP status of the fetch
	%% into a list of directives.
	%%
	%% A 404 or any status of 500 and above produces an empty list (nothing
	%% is disallowed); any other status of 400 and above produces the special
	%% list ['Disallow'] (everything is disallowed); every remaining status
	%% hands Text to parse/1.
	%%
	parse(_Body, Status) when Status =:= 404; Status >= 500 ->
		% not found, or server error -> allow
		[];
	parse(_Body, Status) when Status >= 400 ->
		% remaining 4xx -> disallow
		['Disallow'];
	parse(Body, _Status) ->
		parse(Body).


	%% @spec parse(Text) -> directives()
	%% @doc Parse the text of a robots.txt file into a list of directives.
	%%
	%% Text is cut into lines at any run of CR/LF characters, each line is
	%% stripped of surrounding blanks, and the result is passed to
	%% parse_lines/2.
	%%
	parse(Text) ->
		RawLines = string:tokens(Text, "\n\r"),
		parse_lines([string:strip(Raw) || Raw <- RawLines], []).


	% Collect the directives found in a list of robots.txt lines.
	%
	% Lines - list of strings, one per line of the file
	% Dirs  - accumulator of directives found so far, newest first
	%
	% Returns a list of {'User-agent', Value} and {'Disallow', Value} tuples
	% in file order. Directive names are matched case-insensitively. Blank
	% lines, comment-only lines, lines without a colon and unknown directives
	% are skipped.
	%
	parse_lines([], Dirs) ->
		lists:reverse(Dirs);
	parse_lines([Line|Lines], Dirs) ->
		% a "#" starts a comment that runs to the end of the line; cut it off
		% first, otherwise "Disallow: /x # note" would yield the value
		% "/x # note", which never matches any URL
		Content = lists:takewhile(fun(C) -> C =/= $# end, Line),
		% split "Directive: Value" at the first colon
		case lists:splitwith(fun(C) -> C =/= $: end, Content) of
			{Name, [$:|Value]} ->
				Stripped = string:strip(Value),
				case string:to_lower(string:strip(Name)) of
					"user-agent" ->
						parse_lines(Lines, [{'User-agent', Stripped}|Dirs]);
					"disallow" ->
						parse_lines(Lines, [{'Disallow', Stripped}|Dirs]);
					_ ->
						% unknown directive
						parse_lines(Lines, Dirs)
				end;
			{_, []} ->
				% blank, comment-only or colon-less line
				parse_lines(Lines, Dirs)
		end.


	%% @spec is_allowed(Url, Directives) -> true | false
	%% @doc Returns true if a robot that does not name itself may read Url.
	%% The check is made on behalf of the wildcard agent "*".
	%%
	is_allowed(Url, Directives) ->
		AnyAgent = "*",
		is_allowed(AnyAgent, Url, Directives).


	%% @spec is_allowed(User_agent, Url, Directives) -> true | false
	%% @doc Returns true if User_agent is allowed to read Url.
	%%
	%% User_agent is reduced to the text before its first space or slash, so
	%% "foobot/0.1" becomes "foobot". An agent that contains nothing but
	%% spaces and slashes (including "") is treated as "*". Only the path
	%% component of Url is checked; an empty path is read as "/".
	%%
	is_allowed(Agent, Url, List) ->
		% string:tokens/2 returns [] when Agent holds no token at all
		% ("", " ", "/"); matching on the result avoids the hd([]) crash
		Agent1 = case string:tokens(Agent, " /") of
			[Name|_] -> Name;
			[] -> "*"
		end,
		% get the path only
		Path = case element(3, mochiweb_util:urlsplit(Url)) of
			"" -> "/";	% replace "" with "/"
			P -> P
		end,
		is_allowed(Agent1, Path, List, false).

	% Walk the directive list and return the verdict of the first rule that
	% decides the request.
	%
	% Agent - agent name, already reduced to its bare token
	% Url   - path to check
	% List  - directives as produced by parse/1 or parse/2
	% Match - true while the rules being scanned belong to a group whose
	%         User-agent header applies to Agent
	%
	% Returns false as soon as an applicable Disallow value is a prefix of
	% Url, true on an applicable empty Disallow or when the list runs out.
	%
	is_allowed(_Agent, _Url, [], _Match) ->
		true;
	is_allowed(_Agent, _Url, ['Disallow'], _Match) ->
		% the parser put this here because of the HTTP response code
		false;
	is_allowed(Agent, Url, [{'User-agent', _}|_] = List, _Match) ->
		% consecutive User-agent lines form the header of ONE group; the
		% group applies if any of them is "*" or names Agent (compared
		% case-insensitively), so a later non-matching line must not cancel
		% an earlier match
		{Header, Rules} = lists:splitwith(
			fun({'User-agent', _}) -> true; (_) -> false end, List),
		Wanted = string:to_lower(Agent),
		Match = lists:any(
			fun({'User-agent', Name}) ->
				Name =:= "*" orelse string:to_lower(Name) =:= Wanted
			end, Header),
		is_allowed(Agent, Url, Rules, Match);
	is_allowed(Agent, Url, [_|T], false) ->
		% rule of a group that does not apply to Agent
		is_allowed(Agent, Url, T, false);
	is_allowed(_Agent, _Url, [{'Disallow', ""}|_T], true) ->
		% empty Disallow -> everything is allowed
		true;
	is_allowed(Agent, Url, [{'Disallow', Prefix}|T], true) ->
		case lists:prefix(Prefix, Url) of
			true ->
				false;
			false ->
				is_allowed(Agent, Url, T, true)
		end;
	is_allowed(Agent, Url, [_|T], true) ->
		% ignore unknown directives
		is_allowed(Agent, Url, T, true).