lqdc/wc.jl

## wc.jl
# Count how often each word occurs in the stream `text`.
#
# The whole stream is read with `readall`, split on a fixed set of whitespace
# and punctuation characters (the trailing `false` tells `split` not to keep
# empty fields), and tallied in an untyped `Dict`.
# Returns the Dict mapping each word to its number of occurrences.
function wordcount(text)
    contents = readall(text)
    delimiters = Set([' ','\n','\r','\t','-','.',',',':','_','"',';','!'])
    tally = Dict()
    for token in split(contents, delimiters, false)
        # get(..., 0) supplies the starting count for a word seen for the first time
        tally[token] = get(tally, token, 0) + 1
    end
    return tally
end
# Time one end-to-end run: start the timer, open "input1.txt" and pass the
# stream to wordcount, then print the elapsed time.
tic();open(wordcount, "input1.txt");toc()

# elapsed time: 0.674914253 seconds
# input1.txt is http://www.gutenberg.org/cache/epub/2600/pg2600.txt

## wc.py
import re
from collections import Counter

%time c = Counter(re.split('[ \n\r\t-.,:_";!]', open("input1.txt").read()))
# CPU times: user 256 ms, sys: 12.5 ms, total: 269 ms
# Wall time: 255 ms
# input1.txt is http://www.gutenberg.org/cache/epub/2600/pg2600.txt

%time split_str = re.split('[ \n\r\t-.,:_";!]', open("input1.txt").read()
# CPU times: user 79.5 ms, sys: 8.16 ms, total: 87.6 ms
# Wall time: 87.4 ms
# Time only the tallying step, using the token list already held in split_str.
%time c = Counter(split_str)
# CPU times: user 155 ms, sys: 43.1 ms, total: 198 ms
# Wall time: 170 ms
from collections import defaultdict
d = defaultdict(int)
%time d = for w in split_str: d[w] += 1;
# CPU times: user 149 ms, sys: 77.6 ms, total: 227 ms
# Wall time: 156 ms

## wc2.jl
import Base.hash
using DataStructures
# One-argument `hash` method for substrings of byte strings: hands a pointer to
# the substring's bytes and its size in bytes to the C function `memhash` and
# returns the resulting Uint64.
# NOTE(review): presumably added to speed up Dict lookups keyed by SubString in
# the benchmarks in this file -- confirm.
function hash{T<:ByteString}(s::SubString{T})
    return ccall(:memhash, Uint64, (Ptr{Void}, Int), pointer(s), sizeof(s))
end

# Count word occurrences in the file named `filename`.
#
# The file is read in full, split on a fixed set of whitespace and punctuation
# characters (the trailing `false` tells `split` not to keep empty fields), and
# every word is added to a DataStructures counter keyed by SubString{UTF8String}.
# Returns that counter.
function wordcounter(filename)
    # open(f, name) closes the file even if readall throws, so the handle
    # cannot leak the way a separate open/close pair can.
    text = open(readall, filename)
    counts = counter(SubString{UTF8String})
    words = split(text, Set([' ','\n','\r','\t','-','.',',',':','_','"',';','!']), false)
    for w in words
        add!(counts, w)
    end
    return counts
end

# Time one full wordcounter run on "input1.txt" (file read included).
@time wordcounter("input1.txt");
# elapsed time: 0.599400593 seconds (92975344 bytes allocated)

## wc3.jl
import Base.hash
# One-argument `hash` method for substrings of byte strings: hands a pointer to
# the substring's bytes and its size in bytes to the C function `memhash` and
# returns the resulting Uint64.
# NOTE(review): presumably added to speed up Dict lookups keyed by SubString in
# the benchmarks in this file -- confirm.
function hash{T<:ByteString}(s::SubString{T})
    return ccall(:memhash, Uint64, (Ptr{Void}, Int), pointer(s), sizeof(s))
end

# Count how often each word occurs in the stream `fp`, timing the two phases.
#
# Phase 1 (timed): read the whole stream and split it on a fixed set of
# whitespace and punctuation characters; the trailing `false` tells `split`
# not to keep empty fields.
# Phase 2 (timed): tally the words in a Dict{SubString{UTF8String},Int}.
# Each @time prints its own elapsed-time line. Returns the Dict of counts.
function wordcount(fp)
    tokens = @time split(readall(fp), Set([' ','\n','\r','\t','-','.',',',':','_','"',';','!']), false)
    tally = Dict{SubString{UTF8String},Int}()
    @time for token in tokens
        # get(..., 0) supplies the starting count for a word seen for the first time
        tally[token] = get(tally, token, 0) + 1
    end
    return tally
end
# Time the whole run: open "input1.txt" read-only and pass the stream to wordcount.
# wordcount prints its own two @time lines first, so three timings are reported in total.
@time open(wordcount, "input1.txt", "r");
# elapsed time: 0.124112166 seconds (51663392 bytes allocated)
# elapsed time: 0.378415071 seconds (37785424 bytes allocated)
# elapsed time: 0.656286844 seconds (97647580 bytes allocated)

# takes 0.62 seconds without the @time within the function
	# Count how often each word occurs in the stream `text`.
	#
	# The whole stream is read with `readall`, split on a fixed set of whitespace
	# and punctuation characters (the trailing `false` tells `split` not to keep
	# empty fields), and tallied in an untyped `Dict`.
	# Returns the Dict mapping each word to its number of occurrences.
	function wordcount(text)
		contents = readall(text)
		delimiters = Set([' ','\n','\r','\t','-','.',',',':','_','"',';','!'])
		tally = Dict()
		for token in split(contents, delimiters, false)
			# get(..., 0) supplies the starting count for a word seen for the first time
			tally[token] = get(tally, token, 0) + 1
		end
		return tally
	end
	# Time one end-to-end run: start the timer, open "input1.txt" and pass the
	# stream to wordcount, then print the elapsed time.
	tic();open(wordcount, "input1.txt");toc()

	# elapsed time: 0.674914253 seconds
	# input1.txt is http://www.gutenberg.org/cache/epub/2600/pg2600.txt
	import re
	from collections import Counter

	%time c = Counter(re.split('[ \n\r\t-.,:_";!]', open("input1.txt").read()))
	# CPU times: user 256 ms, sys: 12.5 ms, total: 269 ms
	# Wall time: 255 ms
	# input1.txt is http://www.gutenberg.org/cache/epub/2600/pg2600.txt

	%time split_str = re.split('[ \n\r\t-.,:_";!]', open("input1.txt").read()
	# CPU times: user 79.5 ms, sys: 8.16 ms, total: 87.6 ms
	# Wall time: 87.4 ms
	# Time only the tallying step, using the token list already held in split_str.
	%time c = Counter(split_str)
	# CPU times: user 155 ms, sys: 43.1 ms, total: 198 ms
	# Wall time: 170 ms
	from collections import defaultdict
	d = defaultdict(int)
	%time d = for w in split_str: d[w] += 1;
	# CPU times: user 149 ms, sys: 77.6 ms, total: 227 ms
	# Wall time: 156 ms
	import Base.hash
	using DataStructures
	# One-argument `hash` method for substrings of byte strings: hands a pointer to
	# the substring's bytes and its size in bytes to the C function `memhash` and
	# returns the resulting Uint64.
	# NOTE(review): presumably added to speed up Dict lookups keyed by SubString in
	# the benchmarks in this file -- confirm.
	function hash{T<:ByteString}(s::SubString{T})
		return ccall(:memhash, Uint64, (Ptr{Void}, Int), pointer(s), sizeof(s))
	end

	# Count word occurrences in the file named `filename`.
	#
	# The file is read in full, split on a fixed set of whitespace and punctuation
	# characters (the trailing `false` tells `split` not to keep empty fields), and
	# every word is added to a DataStructures counter keyed by SubString{UTF8String}.
	# Returns that counter.
	function wordcounter(filename)
		# open(f, name) closes the file even if readall throws, so the handle
		# cannot leak the way a separate open/close pair can.
		text = open(readall, filename)
		counts = counter(SubString{UTF8String})
		words = split(text, Set([' ','\n','\r','\t','-','.',',',':','_','"',';','!']), false)
		for w in words
			add!(counts, w)
		end
		return counts
	end

	# Time one full wordcounter run on "input1.txt" (file read included).
	@time wordcounter("input1.txt");
	# elapsed time: 0.599400593 seconds (92975344 bytes allocated)
	import Base.hash
	# One-argument `hash` method for substrings of byte strings: hands a pointer to
	# the substring's bytes and its size in bytes to the C function `memhash` and
	# returns the resulting Uint64.
	# NOTE(review): presumably added to speed up Dict lookups keyed by SubString in
	# the benchmarks in this file -- confirm.
	function hash{T<:ByteString}(s::SubString{T})
		return ccall(:memhash, Uint64, (Ptr{Void}, Int), pointer(s), sizeof(s))
	end

	# Count how often each word occurs in the stream `fp`, timing the two phases.
	#
	# Phase 1 (timed): read the whole stream and split it on a fixed set of
	# whitespace and punctuation characters; the trailing `false` tells `split`
	# not to keep empty fields.
	# Phase 2 (timed): tally the words in a Dict{SubString{UTF8String},Int}.
	# Each @time prints its own elapsed-time line. Returns the Dict of counts.
	function wordcount(fp)
		tokens = @time split(readall(fp), Set([' ','\n','\r','\t','-','.',',',':','_','"',';','!']), false)
		tally = Dict{SubString{UTF8String},Int}()
		@time for token in tokens
			# get(..., 0) supplies the starting count for a word seen for the first time
			tally[token] = get(tally, token, 0) + 1
		end
		return tally
	end
	# Time the whole run: open "input1.txt" read-only and pass the stream to wordcount.
	# wordcount prints its own two @time lines first, so three timings are reported in total.
	@time open(wordcount, "input1.txt", "r");
	# elapsed time: 0.124112166 seconds (51663392 bytes allocated)
	# elapsed time: 0.378415071 seconds (37785424 bytes allocated)
	# elapsed time: 0.656286844 seconds (97647580 bytes allocated)

	# takes 0.62 seconds without the @time within the function