Skip to content

Instantly share code, notes, and snippets.

@alco
Created July 22, 2014 11:24
Show Gist options
  • Save alco/7f41bb8a9fd3b89e5b97 to your computer and use it in GitHub Desktop.
Save alco/7f41bb8a9fd3b89e5b97 to your computer and use it in GitHub Desktop.
defmodule R do
def sample_file_inefficient(path, count) do
s = File.stream!(path, [], :line)
sample = Enum.take(s, count) |> List.to_tuple
Stream.drop(s, count)
|> Stream.with_index
|> Enum.reduce(sample, fn {line, index}, sample ->
case random(index) do
r when r < count -> put_elem(sample, r, line)
_ -> sample
end
end)
|> Tuple.to_list
end
defp random(n) do
:random.uniform(n+1)-1
end
end
R.sample_file_inefficient("enwiki/enwiki-latest-all-titles-in-ns0", 10)
|> IO.inspect
import random
SAMPLE_COUNT = 10
# Force the value of the seed so the results are repeatable
random.seed(12345)
sample_titles = []
for index, line in enumerate(open("enwiki/enwiki-latest-all-titles-in-ns0")):
# Generate the reservoir
if index < SAMPLE_COUNT:
sample_titles.append(line)
else:
# Randomly replace elements in the reservoir
# with a decreasing probability.
# Choose an integer between 0 and index (inclusive)
r = random.randint(0, index)
if r < SAMPLE_COUNT:
sample_titles[r] = line
print sample_titles
$ du -h enwiki/enwiki-latest-all-titles-in-ns0
218M enwiki/enwiki-latest-all-titles-in-ns0
$ time python reservoir.py
['UFMA\n', 'Ark-La-Tex\n', 'Carolyn_George\n', 'Libby_Potter\n', 'Housepainters\n', 'Fayette_County_Courthouse_(Iowa)\n', 'Hussein_Darbouk\n', 'Ribeira_Palace\n', 'National_Road_574\n', 'Aphalangia\n']
python reservoir.py 23.71s user 0.08s system 99% cpu 23.789 total
$ time elixir reservoir.exs
["Congratulations_(album)\n", "2001_in_King_of_the_Cage\n", "Genero_chico\n",
"GSAT-5P\n", "Limavady_United_F_C\n", "Jeroboam_Rothschild\n",
"Rose_Hill,_Fairfax_County,_VA\n", "Agent_Helix_(G.I._Joe)\n",
"Domain_Athletic_Centre\n", "Pedro_de_cordoba\n"]
elixir reservoir.exs 42.44s user 0.43s system 100% cpu 42.490 total
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment