Last active
August 29, 2015 13:56
-
-
Save lqdc/9342237 to your computer and use it in GitHub Desktop.
word count comparison
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tally how many times each word occurs in the stream `text`.
#
# The whole stream is read at once and cut at any of the delimiter characters
# listed below; the trailing `false` passed to `split` drops empty fields.
# Returns an untyped `Dict` mapping each word to its occurrence count.
function wordcount(text)
    delimiters = Set([' ','\n','\r','\t','-','.',',',':','_','"',';','!'])
    tokens = split(readall(text), delimiters, false)
    tally = Dict()
    for token in tokens
        # Words not seen before start from a count of zero.
        tally[token] = get(tally, token, 0) + 1
    end
    return tally
end
# Benchmark: time one full pass of `wordcount` over the input file.
# `open(f, path)` passes the opened stream to `wordcount`.
tic()
open(wordcount, "input1.txt");
toc()
# elapsed time: 0.674914253 seconds
# input1.txt is http://www.gutenberg.org/cache/epub/2600/pg2600.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from collections import Counter | |
%time c = Counter(re.split('[ \n\r\t-.,:_";!]', open("input1.txt").read())) | |
# CPU times: user 256 ms, sys: 12.5 ms, total: 269 ms | |
# Wall time: 255 ms | |
# input1.txt is http://www.gutenberg.org/cache/epub/2600/pg2600.txt | |
%time split_str = re.split('[ \n\r\t-.,:_";!]', open("input1.txt").read() | |
# CPU times: user 79.5 ms, sys: 8.16 ms, total: 87.6 ms | |
# Wall time: 87.4 ms | |
%time c = Counter(split_str) | |
# CPU times: user 155 ms, sys: 43.1 ms, total: 198 ms | |
# Wall time: 170 ms | |
from collections import defaultdict | |
d = defaultdict(int) | |
%time d = for w in split_str: d[w] += 1; | |
# CPU times: user 149 ms, sys: 77.6 ms, total: 227 ms | |
# Wall time: 156 ms |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Base.hash | |
using DataStructures | |
# Hash a substring of a byte string by passing its raw bytes straight to the
# runtime's C `memhash` routine.
# NOTE(review): presumably added to speed up Dict/counter lookups keyed by
# substrings - confirm. This extends a Base function for a Base type, so it
# affects every user of `hash` on such substrings in the session.
function hash{T<:ByteString}(s::SubString{T})
    nbytes = sizeof(s)
    return ccall(:memhash, Uint64, (Ptr{Void}, Int), pointer(s), nbytes)
end
# Count how often each word occurs in the file named `filename`.
#
# The file is read in one go and cut at any of the delimiter characters listed
# below; the trailing `false` passed to `split` drops empty fields.
# Returns the DataStructures counter holding one tally per distinct word.
function wordcounter(filename)
    # `open(f, path)` closes the file even when reading throws, which the
    # previous open / readall / close sequence did not.
    # NOTE(review): `utf8` is applied because `readall` may return an
    # ASCIIString for pure-ASCII files, whose substrings would not match the
    # SubString{UTF8String} key type below - confirm for the Julia version in use.
    text = utf8(open(readall, filename))
    counts = counter(SubString{UTF8String})
    words = split(text, Set([' ','\n','\r','\t','-','.',',',':','_','"',';','!']), false)
    for w in words
        add!(counts, w)
    end
    return counts
end
# Benchmark: time one full run of `wordcounter` over the input file.
@time(wordcounter("input1.txt"));
# elapsed time: 0.599400593 seconds (92975344 bytes allocated)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Base.hash | |
# Hash a substring of a byte string by passing its raw bytes straight to the
# runtime's C `memhash` routine.
# NOTE(review): presumably added to speed up Dict lookups keyed by substrings -
# confirm. This extends a Base function for a Base type, so it affects every
# user of `hash` on such substrings in the session.
function hash{T<:ByteString}(s::SubString{T})
    nbytes = sizeof(s)
    return ccall(:memhash, Uint64, (Ptr{Void}, Int), pointer(s), nbytes)
end
# Count how often each word occurs in the stream `fp`, timing both phases.
#
# The stream is read in one go and cut at any of the delimiter characters
# listed below; the trailing `false` passed to `split` drops empty fields.
# Two `@time` reports are printed: first for reading and splitting, then for
# the counting loop. Returns a Dict from word to occurrence count.
function wordcount(fp)
    # NOTE(review): `utf8` is applied because `readall` may return an
    # ASCIIString for pure-ASCII input, whose substrings could not be stored
    # as SubString{UTF8String} keys - confirm for the Julia version in use.
    # It stays inside the timed expression so the first report still covers
    # reading as well as splitting.
    @time words = split(utf8(readall(fp)), Set([' ','\n','\r','\t','-','.',',',':','_','"',';','!']), false)
    counts = Dict{SubString{UTF8String},Int}()
    @time for w in words
        # Words not seen before start from a count of zero.
        counts[w] = get(counts, w, 0) + 1
    end
    return counts
end
# Benchmark: time the whole call; `wordcount` prints its two inner timings first.
@time open("input1.txt", "r") do fp
    wordcount(fp)
end;
# Output, in the order printed (read + split, counting loop, whole call):
# elapsed time: 0.124112166 seconds (51663392 bytes allocated)
# elapsed time: 0.378415071 seconds (37785424 bytes allocated)
# elapsed time: 0.656286844 seconds (97647580 bytes allocated)
# takes 0.62 seconds without the @time within the function
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment