Skip to content

Instantly share code, notes, and snippets.

@beerriot
Created April 28, 2013 22:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save beerriot/28258f2a44fc482016b1 to your computer and use it in GitHub Desktop.
Save beerriot/28258f2a44fc482016b1 to your computer and use it in GitHub Desktop.
Demonstrations of a race condition in supervisor:monitor_child/1.
%% @doc Test for demonstrating the race condition in
%% supervisor:monitor_child/1.
%%
%% This test exercises code that has been extracted from
%% supervisor.erl to make the problem more obvious, and the test more
%% narrow. For a version that exercises the code directly in the
%% supervisor module, see exitrace_sup.erl.
%%
%% This test requires EQC PULSE to be installed.
%%
%% Running:
%%
%% compile:file("exitrace.erl", []).
%% eqc:module(exitrace).
-module(exitrace).
-compile([export_all]).
-include_lib("pulse/include/pulse.hrl").
-include_lib("eqc/include/eqc.hrl").
-compile({parse_transform, pulse_instrument}).
%%% Our fake supervisor %%%%%%%%%%%%%%%%%
%% @doc Entry point of the fake supervisor process.
%%
%% Turns on trap_exit so that the termination of a linked process is
%% delivered as an 'EXIT' message, then blocks until `{start, Tester}'
%% arrives. At that point the fake child is spawned and linked, its pid
%% is sent back to `Tester', and control passes to sup_run/1.
sup_start() ->
    process_flag(trap_exit, true),
    receive
        {start, Tester} ->
            ChildPid = spawn_link(fun child/0),
            %% let the test know which process to poke
            Tester ! ChildPid,
            sup_run(ChildPid)
    end.
%% @doc The dangerous, race-prone part of the fake supervisor.
%%
%% Waits for `{stop, Tester}', then shuts the child down using the same
%% monitor / unlink / "receive ... after 0" sequence as supervisor.erl.
%% The result is handed to sup_wait/3 together with the 'EXIT' message
%% that was seen, or `undefined' if none was in the mailbox yet.
sup_run(Child) ->
    receive
        {stop, Tester} ->
            %% mirror supervisor.erl: monitor first, then drop the link
            erlang:monitor(process, Child),
            unlink(Child),
            receive
                {'EXIT', _, _} = ExitMsg ->
                    sup_wait(Child, Tester, ExitMsg)
            after 0 ->
                %% The zero timeout is where things go wrong: the
                %% child's EXIT message may simply not have been
                %% delivered yet, even though the child is already
                %% dead enough for the monitor created above to
                %% produce a DOWN message with reason 'noproc'.
                exit(Child, kill),
                sup_wait(Child, Tester, undefined)
            end
    end.
%% @doc Block until the monitor's 'DOWN' message for `Child' arrives and
%% report the outcome to the test process.
%%
%% `Exit' is whatever sup_run/1 observed (an 'EXIT' message or
%% `undefined'); it is paired with the full 'DOWN' message and sent to
%% `Test' as `{Exit, Down}'. The sent tuple is also the return value.
sup_wait(Child, Test, Exit) ->
    receive
        {'DOWN', _, process, Child, _} = DownMsg ->
            Report = {Exit, DownMsg},
            Test ! Report
    end.
%%% Our fake child %%%%%%%%%%%%%%%%%
%% @doc Body of the fake child: idle until told to `stop', then return
%% `ok' (which makes the process exit normally when used as a spawn fun).
child() ->
    receive
        stop ->
            ok
    end.
%%% Test %%%%%%%%%%%%%%%%%%%%%%%%%%%
%% @doc One run of the race.
%%
%% Starts the fake supervisor, asks it to start its child, and then
%% fires the two competing events back to back: the supervisor is told
%% to stop the child while the child is told to exit on its own.
%% Returns whatever the supervisor reports (the `{Exit, Down}' pair sent
%% by sup_wait/3).
test_fun() ->
    Sup = spawn_link(fun sup_start/0),
    Sup ! {start, self()},
    %% the first message back is the child's pid
    ChildPid = receive Pid -> Pid end,
    %% the race: supervisor-initiated stop vs. voluntary child exit
    Sup ! {stop, self()},
    ChildPid ! stop,
    receive Outcome -> Outcome end.
%% @doc PULSE property: the fake supervisor must never end up with no
%% 'EXIT' message (`undefined') and, at the same time, a 'DOWN' reason
%% of `noproc'. On failure both observed values are printed.
prop_never_noproc() ->
    pulse:start(),
    ?PULSE({ExitMsg, {'DOWN', _, _, _, DownReason}}, test_fun(),
           ?WHENFAIL(
              io:format("Exit: ~p~nDown: ~p~n", [ExitMsg, DownReason]),
              not (ExitMsg == undefined andalso DownReason == noproc))).
%% @doc Test for demonstrating the race condition in
%% supervisor:monitor_child/1.
%%
%% If a child process decides to exit on its own at the same time
%% another process asks the child's supervisor to terminate it, the
%% child's exit may be reported as reason 'noproc'.
%%
%% This happens because the child's EXIT message is not guaranteed to
%% be delivered to the supervisor before the "after 0" timeout in
%% supervisor:monitor_child/1. If the erlang:monitor/2 call in that
%% function happens after the child exits, the supervisor receives a
%% DOWN message with reason 'noproc'.
%%
%% This is an EQC PULSE test to demonstrate the behavior. The
%% structure of the test is slightly unfortunate, because normally we
%% would just add PULSE annotations (via parse transform) to
%% supervisor.erl. That's a little cumbersome with a standard Erlang
%% distribution, so the pulse_supervisor module from the pulse_otp
%% project is used instead. The pulse_supervisor module contains the
%% same race condition.
%%
%% HOWEVER, the test is also set up to just use the regular supervisor
%% module if the PULSE macro is not defined. The test is
%% non-deterministic when run this way, but it has been observed to
%% still cause the same failure.
%%
%% Running:
%%
%% compile:file("exitrace_sup.erl", [{d, 'PULSE'}]).
%% eqc:module(exitrace_sup).
%%
%% Or, leave out the PULSE macro in the compile options to run without
%% PULSE against the standard supervisor. When doing so, you may want
%% to use
%%
%% eqc:quickcheck(eqc:numtests(100000, exitrace_sup:prop_never_noproc())).
%%
%% to give yourself more runs (and thus a better chance) to hit the
%% non-deterministic case.
%%
%% Failure output will look something like this:
%%
%% prop_never_noproc: .............Failed! After 14 tests.
%% {62493,44042,715}
%% DownReason: normal
%% SupReason: noproc
%% [prop_never_noproc]
%%
%% For a version of this test that extracts the problem code from
%% supervisor.erl to test it more narrowly, see exitrace.erl.
-module(exitrace_sup).
-compile([export_all]).
-include_lib("eqc/include/eqc.hrl").
-ifdef(PULSE).
-include_lib("pulse/include/pulse.hrl").
-compile({parse_transform, pulse_instrument}).
-compile({pulse_replace_module,[{supervisor,pulse_supervisor}]}).
-endif.
%% supervisor callback
%%
%% Declares a simple_one_for_one supervisor (restart intensity: 1000
%% restarts within 3600 seconds) whose children are temporary workers
%% started through start_fake_child_link/0 and shut down with
%% brutal_kill.
init([]) ->
    SupFlags = {simple_one_for_one, 1000, 3600},
    ChildSpec = {undefined,
                 {?MODULE, start_fake_child_link, []},
                 temporary, brutal_kill, worker, [?MODULE]},
    {ok, {SupFlags, [ChildSpec]}}.
%% Start function for the supervised child. The child does not exit
%% right away; it blocks until it receives `stop', purely so that the
%% test has time to attach a monitor to it before it dies.
start_fake_child_link() ->
    WaitForStop = fun() ->
                          receive
                              stop -> ok
                          end
                  end,
    Pid = spawn_link(WaitForStop),
    {ok, Pid}.
%% @doc One run of the race against a real (or PULSE) supervisor.
%%
%% Starts a supervisor and one child, then tells the child to exit on
%% its own while also asking the supervisor to terminate it. Returns
%% `{DownReason, SupReason}' where DownReason is the exit reason seen by
%% this process's own monitor on the child, and SupReason is the reason
%% the supervisor passed to report_error/4 with the `shutdown_error'
%% context, or `undefined' if no such call was traced.
test_fun() ->
%% our processes under test
{ok, Sup} = supervisor:start_link(?MODULE, []),
{ok, Child} = supervisor:start_child(Sup, []),
%% the monitoring we need
%% technically, we don't need to monitor the child since that
%% simple process should *always* exit 'normal', but for
%% completeness' sake, we do anyway
erlang:monitor(process, Child),
erlang:trace(Sup, true, [call]),
%% since supervisor gives no other hooks into its events, we'll
%% monitor its calls to report_error, and watch for the
%% `shutdown_error` context
%% includes both pulse & non-pulse modules, so test can be run
%% either way
TP = [{pulse_supervisor, report_error, 4},
{supervisor, report_error, 4}],
[ erlang:trace_pattern(P, true, [local]) || P <- TP ],
%% the moment of test: the child's voluntary exit races with the
%% supervisor's attempt to terminate it
Child ! stop,
supervisor:terminate_child(Sup, Child),
%% pick up all the results
DownReason = receive
{'DOWN', _MRef, process, Child, DR} ->
DR
end,
%% trace_delivered acts as an end marker: if its message is reached
%% before any report_error trace message, no shutdown error was traced
TRef = erlang:trace_delivered(Sup),
SupReason = receive
{trace, Sup, call,
{pulse_supervisor, report_error,
[shutdown_error, SR, _, _]}} ->
SR;
{trace, Sup, call,
{supervisor, report_error,
[shutdown_error, SR, _, _]}} ->
SR;
{trace_delivered, Sup, TRef} ->
undefined
end,
%% cleanup: undo the tracing, then unlink before killing the
%% supervisor so the kill does not propagate to this process
[ erlang:trace_pattern(P, false, [local]) || P <- TP ],
erlang:trace(Sup, false, [call]),
unlink(Sup),
exit(Sup, kill),
%% reasons for the test to interpret
{DownReason, SupReason}.
-ifdef(PULSE).
%% Deterministic variant: PULSE controls the scheduling of test_fun/0
%% and the outcome is judged by is_not_noproc/1.
prop_never_noproc() ->
    pulse:start(),
    ?PULSE(Outcome, test_fun(), is_not_noproc(Outcome)).
-else.
%% This variant is completely non-deterministic, but it allows the test
%% to be run with the actual supervisor instead of pulse_supervisor; it
%% has been observed to fail at least once. The generated integer is
%% ignored; it only serves to repeat the test.
prop_never_noproc() ->
    ?FORALL(_, int(),
            begin
                Outcome = test_fun(),
                is_not_noproc(Outcome)
            end).
-endif.
%% @doc Property check for one `{DownReason, SupReason}' result of
%% test_fun/0.
%%
%% Passes when the child's own exit reason is not `noproc' and the
%% supervisor either reported nothing (`undefined') or reported the very
%% same reason. Both reasons are printed when the property fails.
is_not_noproc({DownReason, SupReason}) ->
    SupAgrees = SupReason == undefined orelse SupReason == DownReason,
    Passed = DownReason /= noproc andalso SupAgrees,
    ?WHENFAIL(io:format("DownReason: ~p~nSupReason: ~p~n",
                        [DownReason, SupReason]),
              Passed).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment