Created
December 2, 2014 15:06
-
-
Save yeahnoob/3863a0c910a5c91ceac3 to your computer and use it in GitHub Desktop.
wordpairs optimization use custom hash index
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Commentary: | |
### With JuliaLang v0.3.3 | |
# English words' Hash Function borrowed from http://www.cse.yorku.ca/~oz/hash.html | |
function hashdjb2(t::String) | |
### used for english words' hash | |
## ha = convert(Uint,5381) | |
ha = 0x0000000000001505 | |
for c in t | |
ha = ((ha << 5) + ha)>>3 + int(c) | |
end | |
ha | |
end | |
function processdata!( filename::String, idx::Dict{Uint, Array{Uint}}; myhash::Function = hashdjb2 ) | |
### result store in index variable "idx" | |
## sort lines | |
W = String[] | |
W = readdlm(filename, ' ', String) | |
const linescount = size(W,1) | |
## get a index, [ HashValueOfKeyword ---> [MatchedLinesNumber, ...] ] | |
for i in 1:linescount | |
key = myhash(W[i,1]) | |
if haskey(idx,key) | |
push!(idx[key], i) | |
else | |
idx[key] = [i] | |
end | |
end | |
end | |
function bench( loops::Int64, filename::String, keyword::String; mybenchhash::Function = hashdjb2 ) | |
### benchmark processdata() function | |
println("\n... Process [", filename, "] and count pairs with [", keyword, "] for [", loops, "] Times ...") | |
println("... hash words with function [", string(mybenchhash), "]") | |
for i = 1:loops | |
global_idx = Dict{Uint, Array{Uint}}() | |
sizehint(global_idx, typemax(Uint16)) | |
@time processdata!(filename, global_idx; myhash = mybenchhash) | |
println( "...... pairs of [", keyword, "] = ", length(global_idx[mybenchhash(keyword)]) ) | |
end | |
end | |
bench( int(ARGS[1]), "wordpairs.txt", string(ARGS[2]) ) | |
## Around 10% slower if using Julia's default hash() function | |
#bench( int(ARGS[1]), "wordpairs.txt", string(ARGS[2]); mybenchhash = hash ) | |
None |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment