Normalize user agents (#14)
AntoineGagne committed Jan 21, 2023
1 parent 3acf29e commit 22a63f4
Showing 3 changed files with 43 additions and 9 deletions.
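
This commit makes user-agent matching case-insensitive: agent names are now lowercased (in addition to being stored reversed) both when a robots.txt is parsed and when a rules lookup is made. A minimal sketch of the intended effect, illustrative only and not taken from the diff, assuming the robots.txt content below and that 200 is accepted as a valid status code:

    %% The robots.txt names the agent in lowercase.
    Content = <<"User-Agent: bot/1.0.0\nDisallow: /private">>,
    {ok, RulesIndex} = robots:parse(Content, 200),
    %% The lookup agent is normalized the same way, so a differently cased
    %% agent now matches the same Disallow rule instead of falling through
    %% to the default "allowed".
    false = robots:is_allowed(<<"BoT/1.0.0">>, <<"/private">>, RulesIndex).
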
19 changes: 12 additions & 7 deletions src/robots.erl
@@ -56,9 +56,9 @@ is_allowed(_Agent, _Url, {allowed, all}) ->
     true;
 is_allowed(_Agent, _Url, {disallowed, all}) ->
     false;
-is_allowed(Agent, Url, RulesIndex) ->
-    Reversed = reverse(Agent),
-    MaybeRules = find_agent_rules(Reversed, RulesIndex),
+is_allowed(RawAgent, Url, RulesIndex) ->
+    Agent = to_agent(RawAgent),
+    MaybeRules = find_agent_rules(Agent, RulesIndex),
     is_allowed(Url, MaybeRules).
 
 -spec sitemap(agent_rules()) -> {ok, sitemap()} | {error, not_found}.
@@ -145,11 +145,11 @@ trim(String) ->
 
 -spec build_rules({binary(), binary()}, {[agent()], boolean(), rules_index()}) ->
     {[agent()], boolean(), rules_index()}.
-build_rules({<<"user-agent">>, Agent}, {Agents, false, RulesIndex}) ->
-    Reversed = reverse(Agent),
+build_rules({<<"user-agent">>, RawAgent}, {Agents, false, RulesIndex}) ->
+    Reversed = to_agent(RawAgent),
     {[Reversed | Agents], false, RulesIndex};
-build_rules({<<"user-agent">>, Agent}, {_Agents, true, RulesIndex}) ->
-    Reversed = reverse(Agent),
+build_rules({<<"user-agent">>, RawAgent}, {_Agents, true, RulesIndex}) ->
+    Reversed = to_agent(RawAgent),
     {[Reversed], false, RulesIndex};
 build_rules({<<"allow">>, Rule}, {Agents, _, RulesIndex}) ->
     {_, UpdatedIndex} = lists:foldl(fun update_index/2, {{allowed, Rule}, RulesIndex}, Agents),
@@ -206,6 +206,11 @@ match(<<A, R1/binary>>, <<A, R2/binary>>) ->
 match(<<_, _/binary>>, <<_, _/binary>>) ->
     false.
 
+-spec to_agent(Raw :: binary()) -> unicode:chardata().
+to_agent(Raw) ->
+    Reversed = reverse(Raw),
+    string:lowercase(Reversed).
+
 %% Taken from: https://stackoverflow.com/a/43310493
 -spec reverse(binary()) -> binary().
 reverse(Binary) ->
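
For reference, the keys of the rules index are the reversed and now lowercased agent names. A small illustration (assuming, since the body of reverse/1 is not expanded in this diff, that it simply reverses the bytes of the binary, so reverse(<<"Twitterbot">>) gives <<"tobrettiwT">>):

    %% Lowercasing the reversed agent yields the key the integration test
    %% below now expects in the parsed index.
    1> string:lowercase(<<"tobrettiwT">>).
    <<"tobrettiwt">>
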
31 changes: 30 additions & 1 deletion test/robots_SUITE.erl
@@ -11,6 +11,7 @@
 -define(CODE_5XX, 514).
 -define(EMPTY_CONTENT, <<>>).
 -define(USER_AGENT, <<"bot/1.0.0">>).
+-define(ANOTHER_USER_AGENT, <<"BoT/1.0.0">>).
 -define(REVERSED_USER_AGENT, <<"0.0.1/tob">>).
 -define(NON_EXISTENT_USER_AGENT, <<"nonexistent/1.0.0">>).
 -define(AN_URL, <<"/bot-url">>).
@@ -24,6 +25,10 @@
     <<"User-Agent: ", ?USER_AGENT/binary, "\nAllow: ", ?A_RULE/binary, "\nDisallow: ",
         ?ANOTHER_RULE/binary>>
 ).
+-define(SOME_CONTENT_WITH_REPEATED_AGENTS,
+    <<"User-Agent: ", ?USER_AGENT/binary, "\nAllow: ", ?A_RULE/binary, "\nUser-Agent: ",
+        ?ANOTHER_USER_AGENT/binary, "\nDisallow: ", ?ANOTHER_RULE/binary>>
+).
 
 -define(A_VALID_CONTENT_WITH_COMMENT, <<?A_VALID_CONTENT/binary, "# this is a comment">>).
 -define(A_MALFORMED_CONTENT, <<"User-Agent: ", ?USER_AGENT/binary, "\n", ?A_RULE/binary>>).
@@ -44,11 +49,13 @@ groups() ->
         can_parse_valid_robots_txt,
         can_parse_valid_non_binary_robots_txt,
         can_handle_malformed_content,
+        merge_repeated_agent,
         can_fetch_sitemap,
         return_error_on_non_existent_sitemap,
         allow_all_on_unmatched_agents_at_end_of_file,
         ignore_inline_comments,
         return_true_if_agent_is_allowed,
+        match_independently_of_the_casing_of_the_agent,
         return_false_if_agent_is_disallowed,
         return_true_if_no_matching_rules_can_be_found,
         return_true_if_everything_is_allowed_for_the_corresponding_agent
@@ -154,6 +161,17 @@ allow_all_on_unmatched_agents_at_end_of_file(_Config) ->
         robots:parse(<<"User-Agent: ", ?USER_AGENT/binary>>, ?A_VALID_CODE)
     ).
 
+merge_repeated_agent() ->
+    [
+        {doc,
+            "Given a rules index with the same user agent repeated, when parsing, then merges the rules."}
+    ].
+merge_repeated_agent(_Config) ->
+    ?assertMatch(
+        {ok, #{?REVERSED_USER_AGENT := {[<<"/foo/*">>], [<<"/bar">>]}}},
+        robots:parse(?SOME_CONTENT_WITH_REPEATED_AGENTS, ?A_VALID_CODE)
+    ).
+
 ignore_inline_comments() ->
     [{doc, "Given a rule with a comment in it, when parsing, then ignores the comment."}].
 
@@ -175,6 +193,18 @@ return_true_if_agent_is_allowed(_Config) ->
 
     ?assert(robots:is_allowed(?USER_AGENT, ?A_MATCHING_URL, RulesIndex)).
 
+match_independently_of_the_casing_of_the_agent() ->
+    [
+        {doc,
+            "Given a rules index with allowed URL for the corresponding agent, "
+            "when checking if allowed with the allowed agent in different casing, "
+            "then returns true."}
+    ].
+match_independently_of_the_casing_of_the_agent(_Config) ->
+    {ok, RulesIndex} = robots:parse(?ANOTHER_VALID_CONTENT, ?A_VALID_CODE),
+
+    ?assert(robots:is_allowed(string:uppercase(?USER_AGENT), ?A_MATCHING_URL, RulesIndex)).
+
 return_false_if_agent_is_disallowed() ->
     [
         {doc,
@@ -205,7 +235,6 @@ return_true_if_everything_is_allowed_for_the_corresponding_agent() ->
             "Given a rules index with an agent for which everything is allowed, "
             "when checking if allowed, then returns true."}
     ].
-
 return_true_if_everything_is_allowed_for_the_corresponding_agent(_Config) ->
     {ok, RulesIndex} = robots:parse(<<"User-Agent: ", ?USER_AGENT/binary>>, ?A_VALID_CODE),
 
2 changes: 1 addition & 1 deletion test/robots_integration_SUITE.erl
@@ -35,7 +35,7 @@ can_parse_valid_robots_txt(Config) ->
     Valid = ?config(valid, Config),
 
     ?assertMatch(
-        {ok, #{<<"tobrettiwT">> := {[<<"/imgres">>], []}}},
+        {ok, #{<<"tobrettiwt">> := {[<<"/imgres">>], []}}},
         robots:parse(Valid, ?A_VALID_CODE)
     ).
