Skip to content

Instantly share code, notes, and snippets.

@pichi
Last active March 30, 2016 17:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pichi/2d10c93242d5057913d026a607f07dd4 to your computer and use it in GitHub Desktop.
Save pichi/2d10c93242d5057913d026a607f07dd4 to your computer and use it in GitHub Desktop.
Stopwords Benchmark
$ erl -pa eministat/ebin
Erlang/OTP 18 [erts-7.3] [source] [64-bit] [smp:4:4] [async-threads:10] [hipe] [kernel-poll:false]
Eshell V7.3 (abort with ^G)
1> {ok, Bin} = file:read_file("/home/hynek/Downloads/words.txt"), L = string:tokens(binary_to_list(Bin), "\s\r\n"), length(L).
113809
2> length(lists:filter(fun stopwords_clause:is_stopword/1, L)).
122
3> length(lists:filter(fun stopwords_map:is_stopword/1, L)).
122
4> Clause = eministat:s("clause", fun() -> lists:filter(fun stopwords_clause:is_stopword/1, L) end, 50).
{dataset,"clause",
[3490,3493,3498,3501,3504,3507,3513,3539,3541,3544,3548,
3549,3551,3552,3554,3557,3559,3560,3562,3564,3570,3571,3581,
3589,3591,3611|...],
181508.0,6.595332e8,50}
5> Map = eministat:s("map", fun() -> lists:filter(fun stopwords_map:is_stopword/1, L) end, 50).
{dataset,"map",
[10950,10965,10971,10978,10982,10983,10988,10993,10998,
11002,11012,11013,11016,11017,11017,11019,11021,11025,11026,
11026,11028,11030,11035,11038,11040,11045|...],
555276.0,6170067514.0,50}
6> eministat:x(95.0, Clause, Map).
x clause
+ map
+--------------------------------------------------------------------------+
|xxxxx +++++ +|
|xxxx ++++ |
|xxxx +++ |
|xxxx ++ |
|xxx ++ |
|xxx ++ |
|xx ++ |
|xx ++ |
|xx ++ |
|xx + |
|xx + |
|xx + |
|xx + |
|xx + |
| x + |
| x + |
| x + |
| x + |
| x + |
| x + |
| x + |
| x + |
| x + |
| x + |
| x + |
| + |
| + |
| + |
| + |
| + |
| + |
| + |
| + |
| + |
||A| |
| |_MA_| |
+--------------------------------------------------------------------------+
Dataset: x N=50 CI=95.0000
Statistic Value [ Bias] (Bootstrapped LB‥UB)
Min: 3490.00
1st Qu. 3551.00
Median: 3591.00
3rd Qu. 3679.00
Max: 3945.00
Average: 3630.16 [ 0.137534] ( 3602.82 ‥ 3664.56)
Std. Dev: 113.400 [ -1.81311] ( 90.8425 ‥ 141.539)
Outliers: 0/4 = 4 (μ=3630.30, σ=111.587)
Outlier variance: 0.151802 (moderate)
------
Dataset: + N=50 CI=95.0000
Statistic Value [ Bias] (Bootstrapped LB‥UB)
Min: 1.09500e+4
1st Qu. 1.10160e+4
Median: 1.10400e+4
3rd Qu. 1.11270e+4
Max: 1.28270e+4
Average: 1.11055e+4 [ 0.297998] ( 1.10611e+4 ‥ 1.12491e+4)
Std. Dev: 264.914 [ -31.0673] ( 84.7956 ‥ 582.629)
Outliers: 0/2 = 2 (μ=1.11058e+4, σ=233.847)
Outlier variance: 9.45082e-2 (slight)
Difference at 95.0% confidence
7475.36 ± 80.8533
205.924% ± 2.22726%
(Student's t, pooled s = 203.763)
------
ok
-module(stopwords_clause).
-export([is_stopword/1]).

%% @doc Stopword test implemented purely with pattern matching.
%%
%% Returns `true' when `Word' is exactly equal to one of the lowercase
%% English stopword strings enumerated below, and `false' for every other
%% term. Matching is exact: there is no case folding, trimming or
%% conversion from binaries or atoms.
-spec is_stopword(term()) -> boolean().
is_stopword(Word) ->
    case Word of
        %% a
        "a" -> true;
        "about" -> true;
        "above" -> true;
        "after" -> true;
        "again" -> true;
        "against" -> true;
        "all" -> true;
        "am" -> true;
        "an" -> true;
        "and" -> true;
        "any" -> true;
        "are" -> true;
        "aren't" -> true;
        "as" -> true;
        "at" -> true;
        %% b
        "be" -> true;
        "because" -> true;
        "been" -> true;
        "before" -> true;
        "being" -> true;
        "below" -> true;
        "between" -> true;
        "both" -> true;
        "but" -> true;
        "by" -> true;
        %% c
        "can't" -> true;
        "cannot" -> true;
        "could" -> true;
        "couldn't" -> true;
        %% d
        "did" -> true;
        "didn't" -> true;
        "do" -> true;
        "does" -> true;
        "doesn't" -> true;
        "doing" -> true;
        "don't" -> true;
        "down" -> true;
        "during" -> true;
        %% e
        "each" -> true;
        %% f
        "few" -> true;
        "for" -> true;
        "from" -> true;
        "further" -> true;
        %% h
        "had" -> true;
        "hadn't" -> true;
        "has" -> true;
        "hasn't" -> true;
        "have" -> true;
        "haven't" -> true;
        "having" -> true;
        "he" -> true;
        "he'd" -> true;
        "he'll" -> true;
        "he's" -> true;
        "her" -> true;
        "here" -> true;
        "here's" -> true;
        "hers" -> true;
        "herself" -> true;
        "him" -> true;
        "himself" -> true;
        "his" -> true;
        "how" -> true;
        "how's" -> true;
        %% i
        "i" -> true;
        "i'd" -> true;
        "i'll" -> true;
        "i'm" -> true;
        "i've" -> true;
        "if" -> true;
        "in" -> true;
        "into" -> true;
        "is" -> true;
        "isn't" -> true;
        "it" -> true;
        "it's" -> true;
        "its" -> true;
        "itself" -> true;
        %% l
        "let's" -> true;
        %% m
        "me" -> true;
        "more" -> true;
        "most" -> true;
        "mustn't" -> true;
        "my" -> true;
        "myself" -> true;
        %% n
        "no" -> true;
        "nor" -> true;
        "not" -> true;
        %% o
        "of" -> true;
        "off" -> true;
        "on" -> true;
        "once" -> true;
        "only" -> true;
        "or" -> true;
        "other" -> true;
        "ought" -> true;
        "our" -> true;
        "ours" -> true;
        "ourselves" -> true;
        "out" -> true;
        "over" -> true;
        "own" -> true;
        %% s
        "same" -> true;
        "shan't" -> true;
        "she" -> true;
        "she'd" -> true;
        "she'll" -> true;
        "she's" -> true;
        "should" -> true;
        "shouldn't" -> true;
        "so" -> true;
        "some" -> true;
        "such" -> true;
        %% t
        "than" -> true;
        "that" -> true;
        "that's" -> true;
        "the" -> true;
        "their" -> true;
        "theirs" -> true;
        "them" -> true;
        "themselves" -> true;
        "then" -> true;
        "there" -> true;
        "there's" -> true;
        "these" -> true;
        "they" -> true;
        "they'd" -> true;
        "they'll" -> true;
        "they're" -> true;
        "they've" -> true;
        "this" -> true;
        "those" -> true;
        "through" -> true;
        "to" -> true;
        "too" -> true;
        %% u
        "under" -> true;
        "until" -> true;
        "up" -> true;
        %% v
        "very" -> true;
        %% w
        "was" -> true;
        "wasn't" -> true;
        "we" -> true;
        "we'd" -> true;
        "we'll" -> true;
        "we're" -> true;
        "we've" -> true;
        "were" -> true;
        "weren't" -> true;
        "what" -> true;
        "what's" -> true;
        "when" -> true;
        "when's" -> true;
        "where" -> true;
        "where's" -> true;
        "which" -> true;
        "while" -> true;
        "who" -> true;
        "who's" -> true;
        "whom" -> true;
        "why" -> true;
        "why's" -> true;
        "with" -> true;
        "won't" -> true;
        "would" -> true;
        "wouldn't" -> true;
        %% y
        "you" -> true;
        "you'd" -> true;
        "you'll" -> true;
        "you're" -> true;
        "you've" -> true;
        "your" -> true;
        "yours" -> true;
        "yourself" -> true;
        "yourselves" -> true;
        %% Anything else, including non-string terms, is not a stopword.
        _ -> false
    end.
-module(stopwords_map).
-export([is_stopword/1]).

%% @doc Stopword test implemented as a lookup in a constant map.
%%
%% Every key of the map below is a lowercase English stopword string and
%% every value is `true'. `Word' is looked up with `maps:get/3' using
%% `false' as the default, so any term that is not a key (other strings,
%% binaries, atoms, ...) yields `false'.
-spec is_stopword(term()) -> boolean().
is_stopword(Word) ->
    Stopwords = #{
        %% a
        "a" => true, "about" => true, "above" => true, "after" => true,
        "again" => true, "against" => true, "all" => true, "am" => true,
        "an" => true, "and" => true, "any" => true, "are" => true,
        "aren't" => true, "as" => true, "at" => true,
        %% b
        "be" => true, "because" => true, "been" => true, "before" => true,
        "being" => true, "below" => true, "between" => true, "both" => true,
        "but" => true, "by" => true,
        %% c
        "can't" => true, "cannot" => true, "could" => true,
        "couldn't" => true,
        %% d
        "did" => true, "didn't" => true, "do" => true, "does" => true,
        "doesn't" => true, "doing" => true, "don't" => true, "down" => true,
        "during" => true,
        %% e
        "each" => true,
        %% f
        "few" => true, "for" => true, "from" => true, "further" => true,
        %% h
        "had" => true, "hadn't" => true, "has" => true, "hasn't" => true,
        "have" => true, "haven't" => true, "having" => true, "he" => true,
        "he'd" => true, "he'll" => true, "he's" => true, "her" => true,
        "here" => true, "here's" => true, "hers" => true, "herself" => true,
        "him" => true, "himself" => true, "his" => true, "how" => true,
        "how's" => true,
        %% i
        "i" => true, "i'd" => true, "i'll" => true, "i'm" => true,
        "i've" => true, "if" => true, "in" => true, "into" => true,
        "is" => true, "isn't" => true, "it" => true, "it's" => true,
        "its" => true, "itself" => true,
        %% l
        "let's" => true,
        %% m
        "me" => true, "more" => true, "most" => true, "mustn't" => true,
        "my" => true, "myself" => true,
        %% n
        "no" => true, "nor" => true, "not" => true,
        %% o
        "of" => true, "off" => true, "on" => true, "once" => true,
        "only" => true, "or" => true, "other" => true, "ought" => true,
        "our" => true, "ours" => true, "ourselves" => true, "out" => true,
        "over" => true, "own" => true,
        %% s
        "same" => true, "shan't" => true, "she" => true, "she'd" => true,
        "she'll" => true, "she's" => true, "should" => true,
        "shouldn't" => true, "so" => true, "some" => true, "such" => true,
        %% t
        "than" => true, "that" => true, "that's" => true, "the" => true,
        "their" => true, "theirs" => true, "them" => true,
        "themselves" => true, "then" => true, "there" => true,
        "there's" => true, "these" => true, "they" => true,
        "they'd" => true, "they'll" => true, "they're" => true,
        "they've" => true, "this" => true, "those" => true,
        "through" => true, "to" => true, "too" => true,
        %% u
        "under" => true, "until" => true, "up" => true,
        %% v
        "very" => true,
        %% w
        "was" => true, "wasn't" => true, "we" => true, "we'd" => true,
        "we'll" => true, "we're" => true, "we've" => true, "were" => true,
        "weren't" => true, "what" => true, "what's" => true, "when" => true,
        "when's" => true, "where" => true, "where's" => true,
        "which" => true, "while" => true, "who" => true, "who's" => true,
        "whom" => true, "why" => true, "why's" => true, "with" => true,
        "won't" => true, "would" => true, "wouldn't" => true,
        %% y
        "you" => true, "you'd" => true, "you'll" => true, "you're" => true,
        "you've" => true, "your" => true, "yours" => true,
        "yourself" => true, "yourselves" => true
    },
    %% A missing key falls back to the default `false'.
    maps:get(Word, Stopwords, false).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment