Last active
June 15, 2018 07:04
-
-
Save arbenson/e0a189774f035d8f37d029f2f038fa8b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using Dates
using LightXML
"""
    parse_tags(filename::AbstractString)

Scan `filename` line by line and collect, per user, the tag sets of every row whose
`PostTypeId` is `1` and that carries both a `Tags` and an `OwnerUserId` attribute.
(NOTE(review): the attribute names suggest a Stack Exchange `Posts.xml` dump — confirm.)

Only lines containing the text `<row Id` are parsed as XML; all other lines are skipped.
The line number is printed every 100000 lines as a progress indicator.

# Returns
A named tuple `(nverts, simplices, times, tag_map)`:
- `nverts[user]`: number of tags of each collected row of `user`, in file order.
- `simplices[user]`: the integer tag keys of those rows, concatenated in the same order
  (row `k` owns the next `nverts[user][k]` entries).
- `times[user]`: `Dates.value` of each row's `CreationDate` parsed as a `DateTime`.
- `tag_map`: tag string => integer key, assigned 1, 2, 3, ... in order of first appearance.

# Throws
Propagates parse errors for malformed XML, non-integer `PostTypeId`/`OwnerUserId`, or a
missing/unparseable `CreationDate` on a row that is otherwise collected.
"""
function parse_tags(filename::AbstractString)
    tag_map = Dict{String, Int64}()
    nverts = Dict{Int64, Vector{Int64}}()
    simplices = Dict{Int64, Vector{Int64}}()
    times = Dict{Int64, Vector{Int64}}()

    # The do-block form closes the file even if parsing throws part-way through.
    open(filename) do f
        for (lineno, line) in enumerate(eachline(f))
            if lineno % 100000 == 0; println("$lineno"); end
            occursin("<row Id", line) || continue

            record = _parse_question_row(line)
            record === nothing && continue
            user, tags, timestamp = record

            # Keys are handed out densely in order of first appearance; the default
            # closure runs before insertion, so `length(tag_map)` is the old count.
            tag_keys = [
                get!(() -> length(tag_map) + 1, tag_map, tag) for tag in tags
            ]
            append!(get!(() -> Int64[], simplices, user), tag_keys)
            push!(get!(() -> Int64[], nverts, user), length(tag_keys))
            push!(get!(() -> Int64[], times, user), timestamp)
        end
    end

    return (nverts=nverts, simplices=simplices, times=times, tag_map=tag_map)
end

# Parse a single `<row .../>` line. Returns `(user, tags, timestamp)` when the row has
# `PostTypeId == 1` plus `Tags` and `OwnerUserId` attributes, otherwise `nothing`.
function _parse_question_row(line::AbstractString)
    doc = parse_string(line)
    try
        row = root(doc)

        post_type = attribute(row, "PostTypeId")
        (post_type === nothing || parse(Int64, post_type) != 1) && return nothing

        tags = attribute(row, "Tags")
        owner = attribute(row, "OwnerUserId")
        (tags === nothing || owner === nothing) && return nothing

        user = parse(Int64, owner)
        # Drop the first and last character (the outer `<` and `>`), then split on the
        # `><` separators. `chop` works on characters, so a multibyte character next to
        # the closing bracket cannot raise an indexing error the way byte slicing can.
        names = split(chop(tags; head=1, tail=1), "><")
        timestamp = Dates.value(Dates.DateTime(attribute(row, "CreationDate")))
        return (user, names, timestamp)
    finally
        # LightXML documents wrap libxml2 memory that is released only by `free`;
        # attribute values were already copied into Julia strings above.
        LightXML.free(doc)
    end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment