Skip to content

Instantly share code, notes, and snippets.

@arbenson
Last active June 15, 2018 07:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arbenson/e0a189774f035d8f37d029f2f038fa8b to your computer and use it in GitHub Desktop.
Save arbenson/e0a189774f035d8f37d029f2f038fa8b to your computer and use it in GitHub Desktop.
using Dates
using LightXML
"""
    _question_fields(line)

Parse one `<row ... />` line as XML and return `(user, tags, created)` — the raw
`OwnerUserId`, `Tags` and `CreationDate` attribute strings — when the row has
`PostTypeId == 1` and all three attributes are present. Return `nothing` otherwise.
"""
function _question_fields(line::AbstractString)
    doc = parse_string(line)
    try
        row = root(doc)
        post_type = attribute(row, "PostTypeId")
        if post_type === nothing || parse(Int64, post_type) != 1
            return nothing
        end
        tags = attribute(row, "Tags")
        user = attribute(row, "OwnerUserId")
        created = attribute(row, "CreationDate")
        if tags === nothing || user === nothing || created === nothing
            return nothing
        end
        return (user, tags, created)
    finally
        # LightXML documents must be released explicitly; without this every
        # parsed row would stay allocated for the lifetime of the process.
        free(doc)
    end
end

"""
    parse_tags(filename::AbstractString)

Scan `filename` line by line and, for every line containing `<row Id` whose
`PostTypeId` is 1 and that carries `Tags`, `OwnerUserId` and `CreationDate`
attributes, record the post's tags under its owner.

Tags are expected in the form `<tag1><tag2>...`; each distinct tag string is
assigned an integer key in order of first appearance (1, 2, 3, ...).

# Returns
A named tuple `(nverts, simplices, times, tag_map)` where, for each user id:
- `nverts[user]` holds the number of tags of each recorded post, in file order;
- `simplices[user]` holds the tag keys of all recorded posts, concatenated in the
  same order (so `nverts[user]` delimits the individual posts);
- `times[user]` holds `Dates.value` of each post's `CreationDate` (an integer
  count of milliseconds);
and `tag_map` maps each tag string to its integer key.

A progress line is printed every 100000 input lines.
"""
function parse_tags(filename::AbstractString)
    tag_map = Dict{String, Int64}()
    nverts = Dict{Int64, Vector{Int64}}()
    simplices = Dict{Int64, Vector{Int64}}()
    times = Dict{Int64, Vector{Int64}}()

    # The do-block form closes the file even if parsing throws.
    open(filename) do f
        for (lineno, line) in enumerate(eachline(f))
            if lineno % 100000 == 0; println("$lineno"); end
            occursin("<row Id", line) || continue

            fields = _question_fields(line)
            fields === nothing && continue
            user_str, tag_str, created = fields
            user = parse(Int64, user_str)

            # Drop the outer '<' and '>' by character, not by byte index, so
            # tag strings containing multi-byte characters are handled too.
            tag_keys = Int64[]
            for piece in split(chop(tag_str; head=1, tail=1), "><")
                tag = String(piece)
                key = get(tag_map, tag, 0)
                if key == 0
                    key = length(tag_map) + 1
                    tag_map[tag] = key
                end
                push!(tag_keys, key)
            end
            timestamp = Dates.DateTime(created)

            if !haskey(nverts, user)
                nverts[user] = Vector{Int64}()
                simplices[user] = Vector{Int64}()
                times[user] = Vector{Int64}()
            end
            append!(simplices[user], tag_keys)
            push!(nverts[user], length(tag_keys))
            push!(times[user], Dates.value(timestamp))
        end
    end

    return (nverts=nverts, simplices=simplices, times=times, tag_map=tag_map)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment