Skip to content

Instantly share code, notes, and snippets.

@lqdc
Last active August 29, 2015 13:56
Show Gist options
  • Save lqdc/9342237 to your computer and use it in GitHub Desktop.
Save lqdc/9342237 to your computer and use it in GitHub Desktop.
word count comparison
"""
    wordcount(text)

Count how often each word occurs in the stream `text`.

The whole stream is read into memory and split on any of the characters
space, newline, carriage return, tab, `-`, `.`, `,`, `:`, `_`, `"`, `;`, `!`.
Empty fields (produced by adjacent delimiters) are dropped.

# Arguments
- `text`: a readable `IO` (for example the handle passed by `open(wordcount, path)`).

# Returns
A `Dict` mapping each word (a `SubString` of the text) to its `Int` count.
"""
function wordcount(text)
    delims = Set([' ', '\n', '\r', '\t', '-', '.', ',', ':', '_', '"', ';', '!'])
    # `read(io, String)` and the `keepempty` keyword replace the pre-1.0
    # `readall(io)` and the positional `false` flag of `split`.
    words = split(read(text, String), delims; keepempty=false)
    # A concretely typed Dict avoids boxing every key and count as `Any`.
    counts = Dict{eltype(words),Int}()
    for w in words
        counts[w] = get(counts, w, 0) + 1
    end
    return counts
end
# Time one end-to-end run: `open(f, path)` hands the opened stream to `wordcount`
# and closes it afterwards. `@time` is used because `tic()`/`toc()` were removed
# in Julia 1.0.
@time open(wordcount, "input1.txt");
# elapsed time: 0.674914253 seconds
# input1.txt is http://www.gutenberg.org/cache/epub/2600/pg2600.txt
import re
from collections import Counter
%time c = Counter(re.split('[ \n\r\t-.,:_";!]', open("input1.txt").read()))
# CPU times: user 256 ms, sys: 12.5 ms, total: 269 ms
# Wall time: 255 ms
# input1.txt is http://www.gutenberg.org/cache/epub/2600/pg2600.txt
%time split_str = re.split('[ \n\r\t-.,:_";!]', open("input1.txt").read()
# CPU times: user 79.5 ms, sys: 8.16 ms, total: 87.6 ms
# Wall time: 87.4 ms
%time c = Counter(split_str)
# CPU times: user 155 ms, sys: 43.1 ms, total: 198 ms
# Wall time: 170 ms
from collections import defaultdict
d = defaultdict(int)
%time d = for w in split_str: d[w] += 1;
# CPU times: user 149 ms, sys: 77.6 ms, total: 227 ms
# Wall time: 156 ms
import Base.hash
using DataStructures
# Adds a `hash` method for substrings of byte strings: the substring's raw bytes
# are hashed by calling the runtime's C function `memhash` with `pointer(s)` and
# `sizeof(s)`, with the result typed as `Uint64`.
# NOTE(review): presumably added to make Dict/counter lookups keyed by SubString
# cheaper — confirm against the timings recorded below.
# NOTE(review): this extends a Base function on a Base type and is written in
# pre-1.0 syntax (`f{T}(...)`, `Uint64`, `Ptr{Void}`, `ByteString`); verify it is
# still needed, and still agrees with the hash of an equal String, on the Julia
# version in use.
hash{T<:ByteString}(s::SubString{T}) = ccall(:memhash, Uint64, (Ptr{Void}, Int), pointer(s), sizeof(s))
"""
    wordcounter(filename)

Count how often each word occurs in the file at `filename`, using a
DataStructures.jl counter.

The file is read fully into memory and split on any of the characters
space, newline, carriage return, tab, `-`, `.`, `,`, `:`, `_`, `"`, `;`, `!`.
Empty fields are dropped.

# Arguments
- `filename`: path of the text file to read.

# Returns
The counter created by `counter(SubString{String})`, holding one count per word.
"""
function wordcounter(filename)
    # `read(path, String)` opens and closes the file itself, so the handle is
    # released even if reading throws (the manual open/close pair was not).
    text = read(filename, String)
    delims = Set([' ', '\n', '\r', '\t', '-', '.', ',', ':', '_', '"', ';', '!'])
    counts = counter(SubString{String})
    for w in split(text, delims; keepempty=false)
        # `inc!` is the current DataStructures name for "add one occurrence";
        # the older `add!` is no longer available.
        inc!(counts, w)
    end
    return counts
end
# Time one full call of `wordcounter` on input1.txt; the trailing `;` suppresses
# echoing the returned counter.
# NOTE(review): the first call in a fresh session also includes compilation
# time — run it twice if only steady-state time is wanted.
@time wordcounter("input1.txt");
# elapsed time: 0.599400593 seconds (92975344 bytes allocated)
import Base.hash
# Adds a `hash` method for substrings of byte strings: the substring's raw bytes
# are hashed by calling the runtime's C function `memhash` with `pointer(s)` and
# `sizeof(s)`, with the result typed as `Uint64`.
# NOTE(review): presumably added to make Dict lookups keyed by SubString
# cheaper — confirm against the timings recorded below.
# NOTE(review): this extends a Base function on a Base type and is written in
# pre-1.0 syntax (`f{T}(...)`, `Uint64`, `Ptr{Void}`, `ByteString`); verify it is
# still needed, and still agrees with the hash of an equal String, on the Julia
# version in use.
hash{T<:ByteString}(s::SubString{T}) = ccall(:memhash, Uint64, (Ptr{Void}, Int), pointer(s), sizeof(s))
"""
    wordcount(fp)

Count how often each word occurs in the stream `fp`, printing separate
timings for the read-and-split phase and for the counting loop.

The stream is read fully into memory and split on any of the characters
space, newline, carriage return, tab, `-`, `.`, `,`, `:`, `_`, `"`, `;`, `!`.
Empty fields are dropped.

# Arguments
- `fp`: a readable `IO` (for example the handle passed by `open(wordcount, path, "r")`).

# Returns
A `Dict` mapping each word (a `SubString` of the text) to its `Int` count.
"""
function wordcount(fp)
    delims = Set([' ', '\n', '\r', '\t', '-', '.', ',', ':', '_', '"', ';', '!'])
    # Phase 1 timing: read + split. `read(io, String)` and `keepempty=false`
    # replace the pre-1.0 `readall(io)` and positional `false` flag.
    @time words = split(read(fp, String), delims; keepempty=false)
    # Key type is taken from the split result instead of the removed
    # `SubString{UTF8String}` name.
    counts = Dict{eltype(words),Int}()
    # Phase 2 timing: the counting loop only.
    @time for w in words
        counts[w] = get(counts, w, 0) + 1
    end
    return counts
end
# Time the whole run: `open` passes the read-only stream for input1.txt to
# `wordcount`, which itself prints two inner `@time` lines (split, then count)
# before this outer total is printed.
# NOTE(review): the first call in a fresh session also includes compilation
# time — run it twice if only steady-state time is wanted.
@time open(wordcount, "input1.txt", "r");
# elapsed time: 0.124112166 seconds (51663392 bytes allocated)
# elapsed time: 0.378415071 seconds (37785424 bytes allocated)
# elapsed time: 0.656286844 seconds (97647580 bytes allocated)
# takes 0.62 seconds without the @time within the function
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment