-
-
Save beerriot/28258f2a44fc482016b1 to your computer and use it in GitHub Desktop.
Demonstrations of a race condition in supervisor:monitor_child/1.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%% @doc Test for demonstrating the race condition in | |
%% supervisor:monitor_child/1. | |
%% | |
%% This test exercises code that has been extracted from
%% supervisor.erl to make the problem more obvious, and the test more
%% narrow. For a version that exercises the code directly in the
%% supervisor module, see exitrace_sup.erl.
%% | |
%% This test requires EQC PULSE to be installed. | |
%% | |
%% Running: | |
%% | |
%% compile:file("exitrace.erl", []). | |
%% eqc:module(exitrace).
-module(exitrace). | |
-compile([export_all]). | |
-include_lib("pulse/include/pulse.hrl"). | |
-include_lib("eqc/include/eqc.hrl"). | |
-compile({parse_transform, pulse_instrument}). | |
%%% Our fake supervisor %%%%%%%%%%%%%%%%% | |
%% Entry point of the fake supervisor process.
%%
%% Traps exits, then blocks until a `{start, Requester}' message
%% arrives. At that point it spawns a linked child, sends the child's
%% pid back to the requester, and hands over to sup_run/1.
sup_start() ->
    process_flag(trap_exit, true),
    receive
        {start, Requester} ->
            Pid = spawn_link(fun child/0),
            Requester ! Pid,
            sup_run(Pid)
    end.
%% This is the dangerous, race-prone bit.
%%
%% Waits for `{stop, Requester}', then mimics what supervisor.erl does
%% when shutting a child down: monitor it, unlink it, and poll the
%% mailbox once for an already-delivered 'EXIT' message. If none is
%% found the child is killed. Either way the outcome is forwarded to
%% the requester by sup_wait/3; the third argument is the 'EXIT'
%% message that was seen, or `undefined' if there was none.
sup_run(Child) ->
    receive
        {stop, Requester} ->
            %% do like supervisor.erl: monitor first, then unlink
            erlang:monitor(process, Child),
            unlink(Child),
            SeenExit =
                receive
                    {'EXIT', _, _} = ExitMsg ->
                        ExitMsg
                after 0 ->
                    %% This zero timeout is the problem: the EXIT
                    %% message may not have arrived yet, even though
                    %% the process may be dead enough for the monitor
                    %% that was just set up to receive a DOWN message
                    %% with reason 'noproc'.
                    exit(Child, kill),
                    undefined
                end,
            sup_wait(Child, Requester, SeenExit)
    end.
%% Blocks until the 'DOWN' message for `Child' arrives, then sends
%% `{Exit, DownMessage}' to `Test'. `Exit' is passed through untouched.
%% The value of the send expression (the tuple that was sent) is the
%% return value.
sup_wait(Child, Test, Exit) ->
    DownMsg =
        receive
            {'DOWN', _, process, Child, _} = Msg ->
                Msg
        end,
    Test ! {Exit, DownMsg}.
%%% Our fake child %%%%%%%%%%%%%%%%% | |
%% Body of the fake child process: idle until told to `stop', then
%% return `ok' (so the process exits normally).
child() ->
    receive
        stop ->
            ok
    end.
%%% Test %%%%%%%%%%%%%%%%%%%%%%%%%%% | |
%% Drives one round of the race.
%%
%% Starts the fake supervisor, asks it to start its child, and takes
%% the first message that arrives as the child's pid. It then asks the
%% supervisor to stop the child and tells the child to stop on its own,
%% back to back. The next message received is returned as the result;
%% sup_wait/3 sends it in the form `{Exit, DownMessage}'.
test_fun() ->
    Sup = spawn_link(fun sup_start/0),
    Sup ! {start, self()},
    Child = receive Pid -> Pid end,
    %% the race: supervisor-initiated stop vs. child exiting by itself
    Sup ! {stop, self()},
    Child ! stop,
    receive
        Outcome ->
            Outcome
    end.
%% Property: a run of test_fun/0 under PULSE must never end with no
%% 'EXIT' message seen (`undefined') together with a 'DOWN' reason of
%% `noproc'. On failure the observed exit message and down reason are
%% printed.
prop_never_noproc() ->
    pulse:start(),
    ?PULSE({ExitMsg, {'DOWN', _, _, _, DownReason}}, test_fun(),
           ?WHENFAIL(
              io:format("Exit: ~p~nDown: ~p~n", [ExitMsg, DownReason]),
              not (ExitMsg == undefined andalso DownReason == noproc))).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%% @doc Test for demonstrating the race condition in | |
%% supervisor:monitor_child/1. | |
%% | |
%% If a child process decides to exit on its own at the same time | |
%% another process asks the child's supervisor to terminate it, the | |
%% child's exit may be reported as reason 'noproc'. | |
%% | |
%% This happens because the child's EXIT message is not guaranteed to | |
%% be delivered to the supervisor before the "after 0" timeout in | |
%% supervisor:monitor_child/1. If the erlang:monitor/2 call in that | |
%% function happens after the child exits, the supervisor receives a | |
%% DOWN message with reason 'noproc'. | |
%% | |
%% This is an EQC PULSE test to demonstrate the behavior. The | |
%% structure of the test is slightly unfortunate, because normally we | |
%% would just add PULSE annotations (via parse transform) to | |
%% supervisor.erl. That's a little cumbersome with a standard Erlang | |
%% distribution, so the pulse_supervisor module from the pulse_otp | |
%% project is used instead. The pulse_supervisor module contains the | |
%% same race condition. | |
%% | |
%% HOWEVER, the test is also set up to just use the regular supervisor | |
%% module if the PULSE macro is not defined. The test is | |
%% non-deterministic when run this way, but it has been observed to | |
%% still cause the same failure. | |
%% | |
%% Running: | |
%% | |
%% compile:file("exitrace_sup.erl", [{d, 'PULSE'}]). | |
%% eqc:module(exitrace_sup). | |
%% | |
%% Or, leave out the PULSE macro in the compile options to run without | |
%% PULSE against the standard supervisor. When doing so, you may want | |
%% to use | |
%% | |
%% eqc:quickcheck(eqc:numtests(100000, exitrace_sup:prop_never_noproc())). | |
%% | |
%% to give yourself more runs (and thus a better chance) to hit the | |
%% non-deterministic case. | |
%% | |
%% Failure output will look something like this: | |
%% | |
%% prop_never_noproc: .............Failed! After 14 tests. | |
%% {62493,44042,715} | |
%% DownReason: normal | |
%% SupReason: noproc | |
%% [prop_never_noproc] | |
%% | |
%% For a version of this test that extracts the problem code from | |
%% supervisor.erl to test it more narrowly, see exitrace.erl. | |
-module(exitrace_sup). | |
-compile([export_all]). | |
-include_lib("eqc/include/eqc.hrl"). | |
-ifdef(PULSE). | |
-include_lib("pulse/include/pulse.hrl"). | |
-compile({parse_transform, pulse_instrument}). | |
-compile({pulse_replace_module,[{supervisor,pulse_supervisor}]}). | |
-endif. | |
%% supervisor callback
%%
%% Describes a simple_one_for_one supervisor (at most 1000 restarts
%% per 3600 seconds) whose single child spec starts temporary,
%% brutally-killed workers through start_fake_child_link/0.
init([]) ->
    SupFlags = {simple_one_for_one, 1000, 3600},
    ChildSpec = {undefined,
                 {?MODULE, start_fake_child_link, []},
                 temporary, brutal_kill, worker, [?MODULE]},
    {ok, {SupFlags, [ChildSpec]}}.
%% Start function named in the child spec. The spawned process does
%% nothing until it receives `stop'; it waits for that signal only so
%% the test can be sure to get a monitor attached to it before it dies.
%% Returns `{ok, Pid}' with the new process linked to the caller.
start_fake_child_link() ->
    WaitForStop = fun() ->
                          receive
                              stop -> ok
                          end
                  end,
    Pid = spawn_link(WaitForStop),
    {ok, Pid}.
%% Runs one round of the race against a real (or PULSE) supervisor.
%%
%% Starts a supervisor and one child, then tells the child to exit on
%% its own immediately before asking the supervisor to terminate it.
%%
%% Returns `{DownReason, SupReason}':
%%   DownReason - exit reason of the child as seen by this process's
%%                own monitor
%%   SupReason  - reason the supervisor passed to report_error/4 with
%%                the `shutdown_error' context, or `undefined' if no
%%                such call was traced
test_fun() ->
    %% our processes under test
    {ok, Sup} = supervisor:start_link(?MODULE, []),
    {ok, Child} = supervisor:start_child(Sup, []),

    %% the monitoring we need

    %% technically, we don't need to monitor the child since that
    %% simple process should *always* exit 'normal', but for
    %% completeness' sake, we do anyway; the reference is kept so that
    %% only the 'DOWN' from this monitor is accepted below
    MRef = erlang:monitor(process, Child),
    erlang:trace(Sup, true, [call]),

    %% since supervisor gives no other hooks into its events, we'll
    %% monitor its calls to report_error, and watch for the
    %% `shutdown_error` context

    %% includes both pulse & non-pulse modules, so test can be run
    %% either way
    TP = [{pulse_supervisor, report_error, 4},
          {supervisor, report_error, 4}],
    [ erlang:trace_pattern(P, true, [local]) || P <- TP ],

    %% the moment of test
    Child ! stop,
    supervisor:terminate_child(Sup, Child),

    %% pick up all the results
    DownReason = receive
                     {'DOWN', MRef, process, Child, DR} ->
                         DR
                 end,
    TRef = erlang:trace_delivered(Sup),
    SupReason = await_sup_reason(Sup, TRef),

    %% cleanup
    [ erlang:trace_pattern(P, false, [local]) || P <- TP ],
    erlang:trace(Sup, false, [call]),
    unlink(Sup),
    exit(Sup, kill),

    %% reasons for the test to interpret
    {DownReason, SupReason}.

%% Waits for either a traced `report_error(shutdown_error, Reason, _, _)'
%% call made by `Sup' (in supervisor or pulse_supervisor) or the
%% `trace_delivered' marker identified by `TRef'. Returns the traced
%% reason, or `undefined' if the marker arrives first.
%%
%% When a trace message is found, the marker is consumed as well, so it
%% does not pile up in the caller's mailbox over many runs.
await_sup_reason(Sup, TRef) ->
    receive
        {trace, Sup, call,
         {Mod, report_error, [shutdown_error, SR, _, _]}}
          when Mod =:= pulse_supervisor; Mod =:= supervisor ->
            %% erlang:trace_delivered/1 always sends its marker, so
            %% this wait ends once it arrives
            receive
                {trace_delivered, Sup, TRef} -> ok
            end,
            SR;
        {trace_delivered, Sup, TRef} ->
            undefined
    end.
-ifdef(PULSE).
%% Property (PULSE build): every run of test_fun/0 under PULSE must
%% satisfy is_not_noproc/1.
prop_never_noproc() ->
    pulse:start(),
    ?PULSE(Reasons, test_fun(), is_not_noproc(Reasons)).
-else.
%% Property (plain build): this is completely non-deterministic, but
%% allows the test to be run with the actual supervisor instead of
%% pulse_supervisor; has been observed to fail at least once. The
%% generated integer is ignored; it only serves to repeat the test.
prop_never_noproc() ->
    ?FORALL(_Ignored, int(), is_not_noproc(test_fun())).
-endif.
%% Checks the `{DownReason, SupReason}' pair returned by test_fun/0.
%% Passes when the child's down reason is not `noproc' and the
%% supervisor either reported nothing (`undefined') or reported the
%% same reason. Both reasons are printed on failure.
is_not_noproc({DownReason, SupReason}) ->
    SupAgrees = SupReason == undefined orelse SupReason == DownReason,
    ?WHENFAIL(io:format("DownReason: ~p~nSupReason: ~p~n",
                        [DownReason, SupReason]),
              DownReason /= noproc andalso SupAgrees).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment