Created
November 30, 2014 02:16
-
-
Save harry-wood/c447982dbdc84f94f7cc to your computer and use it in GitHub Desktop.
Converts a changesets.osm file (experimental planet download) into a CSV file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Converts a changesets.osm file (experimental planet download) into a CSV file. | |
# | |
# Supply filename or stream as parameter | |
# Output is written to changesets.csv (example lines reported to std out) | |
# | |
# To load the result into postgres: | |
# CREATE TABLE changesets (uid BIGINT, created TIMESTAMP, min_lat FLOAT, max_lat FLOAT, min_lon FLOAT, max_lon FLOAT, changes INTEGER); | |
# COPY changesets FROM '/vagrant/changesets.csv' DELIMITER ',' CSV; | |
# | |
# then you can do fun queries like this user ranking | |
# COPY (SELECT uid, SUM(changes) AS sum FROM changesets WHERE changes>0 | |
# AND created>'2014-03-01' | |
# AND max_lat<12.69 | |
# AND min_lat>4.98 | |
# AND max_lon<-7.48 | |
# AND min_lon>-17.92 | |
# GROUP BY uid ORDER BY sum DESC | |
# ) TO '/tmp/longtail.csv'; | |
# | |
require 'time' | |
outputCSVfile = File.open("changesets.csv", 'w') | |
user_tally = { } | |
count = 0 | |
line = "" | |
ARGF.each do |line| | |
begin | |
if line =~ /<changeset/ and line !~ /num_changes=\"0\"/ | |
count +=1 | |
#puts count.to_s + " " + line | |
#bad harry. writing his own XML parser | |
id, rest = line.split("id=\"")[1].split("\"") | |
user, rest = line.split("user=\"")[1].split("\"") unless line !~ /user=\"/ | |
uid, rest = line.split("uid=\"")[1].split("\"") unless line !~ /uid=\"/ | |
created_at, rest = line.split("created_at=\"")[1].split("\"") | |
open, rest = line.split("open=\"")[1].split("\"") | |
num_changes, rest = line.split("num_changes=\"")[1].split("\"") unless line !~ /num_changes=\"/ | |
if open!="true" | |
closed_at, rest = line.split("closed_at=\"")[1].split("\"") | |
min_lat, rest = line.split("min_lat=\"")[1].split("\"") unless line !~ /min_lat=\"/ | |
max_lat, rest = line.split("max_lat=\"")[1].split("\"") unless line !~ /max_lat=\"/ | |
min_lon, rest = line.split("min_lon=\"")[1].split("\"") unless line !~ /min_lon=\"/ | |
max_lon, rest = line.split("max_lon=\"")[1].split("\"") unless line !~ /max_lon=\"/ | |
end | |
outputCSVfile.puts uid.to_s + ",\"" + created_at + "\"," + min_lat.to_s + "," + max_lat.to_s + "," + min_lon.to_s + "," + max_lon.to_s + "," + num_changes.to_s | |
if count % 50000==0 | |
p count.to_s + ": " + id + ",\"" + user.to_s + "\"," + uid.to_s + ",\"" + created_at + "\"," + open + "," + | |
min_lat.to_s + "," + max_lat.to_s + "," + min_lon.to_s + "," + max_lon.to_s + "," + num_changes.to_s | |
end | |
end | |
rescue | |
puts count.to_s + " LINE:" + line | |
raise | |
end | |
end | |
outputCSVfile.close |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment