Created
September 26, 2012 18:24
-
-
Save aprimc/3789675 to your computer and use it in GitHub Desktop.
robots.txt parser for Erlang
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%% @author Andrej Primc
%% @copyright 2011 Andrej Primc
%% @doc Utility for parsing robots.txt files.
-module(robots_txt). | |
-export([parse/1, parse/2, is_allowed/2, is_allowed/3]). | |
%% @spec parse(Text, Http_response_code) -> directives()
%% @doc Turn the body and HTTP status code of a robots.txt response into a
%% list of directives. A 404, or any status of 500 and above, gives the empty
%% list; any other status of 400 and above gives ['Disallow']; every lower
%% status hands Text over to parse/1.
%%
parse(_Text, Code) when Code =:= 404; Code >= 500 ->
    % not found, or server error -> allow
    [];
parse(_Text, Code) when Code >= 400 ->
    % any other 4xx -> disallow
    ['Disallow'];
parse(Text, _Code) ->
    parse(Text).
%% @spec parse(Text) -> directives()
%% @doc Break Text into lines at CR/LF characters, strip the blanks around
%% each line, and convert the lines into a list of directives.
%%
parse(Text) ->
    Stripped = [string:strip(Line) || Line <- string:tokens(Text, "\n\r")],
    parse_lines(Stripped, []).
%% Convert a list of lines into directives, prepending each recognised one to
%% the accumulator Dirs and returning the result in file order.
%%
%% Recognised lines have the form "Name: Value" where Name is, ignoring case,
%% "user-agent" or "disallow". Blank lines, comment lines, lines without a
%% colon and unknown directives are skipped.
%%
parse_lines(Lines, Dirs) ->
    lists:reverse(lists:foldl(fun parse_line/2, Dirs, Lines)).

%% Add the directive found on Line (if any) in front of Dirs.
parse_line(Line, Dirs) ->
    % "#" starts a comment anywhere on the line, so "Disallow: /x # note"
    % must yield "/x" and not "/x # note"
    {Content, _Comment} = lists:splitwith(fun(C) -> C =/= $# end, Line),
    % split "Name: Value" at the first colon
    case lists:splitwith(fun(C) -> C =/= $: end, Content) of
        {Name, [$: | Value]} ->
            directive(string:to_lower(strip_blanks(Name)), strip_blanks(Value), Dirs);
        {_NoColon, []} ->
            Dirs
    end.

%% Keep the directives the matcher understands, drop the rest.
directive("user-agent", Value, Dirs) ->
    [{'User-agent', Value} | Dirs];
directive("disallow", Value, Dirs) ->
    [{'Disallow', Value} | Dirs];
directive(_Unknown, _Value, Dirs) ->
    Dirs.

%% Remove leading and trailing spaces and tabs.
strip_blanks(Str) ->
    IsBlank = fun(C) -> C =:= $\s orelse C =:= $\t end,
    Leading = lists:dropwhile(IsBlank, Str),
    lists:reverse(lists:dropwhile(IsBlank, lists:reverse(Leading))).
%% @spec is_allowed(Url, Directives) -> true | false
%% @doc Check Url against Directives without naming a user-agent; the
%% wildcard agent "*" is used for the lookup.
%%
is_allowed(Url, Directives) ->
    AnyAgent = "*",
    is_allowed(AnyAgent, Url, Directives).
%% @spec is_allowed(User_agent, Url, Directives) -> true | false
%% @doc Returns true if User_agent is allowed to read Url.
%%
%% Only the product name of User_agent is used ("foobot/0.1" -> "foobot"),
%% and only the path component of Url is checked against the directives.
%% A User_agent with no product name at all (empty, or only spaces and
%% slashes) is treated as the wildcard agent "*".
%%
is_allowed(Agent, Url, List) ->
    is_allowed(agent_token(Agent), url_path(Url), List, false).

%% Strip the version tag after a slash or space; "foobot/0.1" -> "foobot".
%% string:tokens/2 returns [] when Agent holds no token, which used to crash
%% hd/1 for inputs such as " " or "/"; fall back to "*" instead.
agent_token(Agent) ->
    case string:tokens(Agent, " /") of
        [Token | _] -> Token;
        [] -> "*"
    end.

%% Path component of Url, with "" replaced by "/".
url_path(Url) ->
    case element(3, mochiweb_util:urlsplit(Url)) of
        "" -> "/";
        Path -> Path
    end.
%% Walk the directive list and return the verdict of the first rule that
%% applies to Agent.
%%
%% Match says whether the rule group currently being read applies to Agent.
%% A run of consecutive 'User-agent' directives opens one group, which applies
%% when any of its names is "*" or equals Agent ignoring case. Inside an
%% applicable group, an empty 'Disallow' allows everything and a 'Disallow'
%% whose value is a prefix of Url denies it. Reaching the end of the list
%% without a verdict means the URL is allowed.
%%
is_allowed(_Agent, _Url, [], _Match) ->
    true;
is_allowed(_Agent, _Url, ['Disallow'], _Match) ->
    % the parser put this here because of the HTTP response code
    false;
is_allowed(Agent, Url, [{'User-agent', _} | _] = List, _Match) ->
    % several User-agent lines in a row share the rules that follow them,
    % so the group applies if any one of them names this agent
    {Group, Rest} = lists:splitwith(fun is_agent_directive/1, List),
    Applies = lists:any(fun({'User-agent', Name}) -> agent_matches(Agent, Name) end,
                        Group),
    is_allowed(Agent, Url, Rest, Applies);
is_allowed(Agent, Url, [_ | T], false) ->
    % rule belongs to a group for some other agent
    is_allowed(Agent, Url, T, false);
is_allowed(_Agent, _Url, [{'Disallow', ""} | _T], true) ->
    % empty Disallow -> everything is allowed
    true;
is_allowed(Agent, Url, [{'Disallow', Prefix} | T], true) ->
    case lists:prefix(Prefix, Url) of
        true -> false;
        false -> is_allowed(Agent, Url, T, true)
    end;
is_allowed(Agent, Url, [_ | T], true) ->
    % ignore unknown directives
    is_allowed(Agent, Url, T, true).

%% True for {'User-agent', Name} directives only.
is_agent_directive({'User-agent', _}) -> true;
is_agent_directive(_) -> false.

%% True when the group name is the wildcard or names Agent, ignoring case.
agent_matches(_Agent, "*") ->
    true;
agent_matches(Agent, Name) ->
    string:to_lower(Agent) =:= string:to_lower(Name).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment