Skip to content

Instantly share code, notes, and snippets.

@rkusa
Last active August 29, 2015 14:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rkusa/b6d8c72a0a37494889da to your computer and use it in GitHub Desktop.
Save rkusa/b6d8c72a0a37494889da to your computer and use it in GitHub Desktop.
populate test database with github archive data
require 'open-uri'
require 'zlib'
require 'yajl'
require 'net/http'
require 'json'
Net::HTTP.start('localhost', 5984, read_timeout: 3600) do |http|
24.times do |d|
24.times do |h|
url = 'http://data.githubarchive.org/2015-03-%02d-%d.json.gz' % [d + 1, h]
puts 'importing %s' % url
gz = open(url)
js = Zlib::GzipReader.new(gz).read
body = '{"docs":[%s]}' % js.gsub!(/[\n]+/, ',').chomp(',');
# puts body
req = Net::HTTP::Post.new('/events/_bulk_docs')
req["content-type"] = "application/json"
req.body = body
http.request(req)
end
end
end
# curl http://localhost:5984/events -X DELETE
# curl http://localhost:5984/events -X PUT
# from http://data.githubarchive.org/2015-03-01-0.json.gz
# to http://data.githubarchive.org/2015-03-01-18.json.gz
curl http://localhost:5984/events
# {"doc_count":10047784,"data_size":12241214233}
# that are 11.4GB
curl http://localhost:5984/events/_design/index -X PUT -H 'Content-Type: application/json' -d '{"language":"javascript","views":{"login":{"map":"function(doc) { emit(doc.actor.login, null) }"}}}'
curl http://localhost:5984/_active_tasks
# [{"pid":"<0.25716.5>","changes_done":46965,"database":"events","design_document":"_design/index","progress":0,"started_on":1427314967,"total_changes":10047785,"type":"indexer","updated_on":1427315035}]
curl -o /dev/null -s -w "%{time_total}" http://localhost:5984/events/_design/index/_view/login\?key\=\"rkusa\"\&include_docs=true
# 0.117s
require 'open-uri'
require 'zlib'
require 'yajl'
require 'pg'
require 'json'
conn ||= PG.connect(dbname: 'yarvis')
conn.prepare 'insert_event', <<-SQL
INSERT INTO events VALUES(
DEFAULT,
$1
);
SQL
24.times do |d| # 1. - 24.
24.times do |h| # 24h
url = 'http://data.githubarchive.org/2015-01-%02d-%d.json.gz' % [d + 1, h]
puts 'importing %s' % url
gz = open(url)
js = Zlib::GzipReader.new(gz).read
Yajl::Parser.parse(js) do |event|
begin
conn.exec_prepared 'insert_event', [event.to_json]
rescue PG::UntranslatableCharacter
end
end
end
end
# DROP TABLE events;
# CREATE TABLE events (
# id BIGSERIAL PRIMARY KEY,
# event JSONB
# )
-- from http://data.githubarchive.org/2015-03-01-0.json.gz
-- to http://data.githubarchive.org/2015-03-01-18.json.gz
SELECT COUNT(*) FROM events;
--- 10.047.681
CREATE INDEX login_idx ON events ((event->'actor'->'login'));
SELECT
pg_size_pretty(pg_total_relation_size('events')) AS total,
pg_size_pretty(pg_relation_size('events')) AS table,
pg_size_pretty(pg_relation_size('login_idx')) AS login_index,
pg_size_pretty(pg_relation_size('event_idx')) AS event_index
-- total table index
-- 14 GB 9725 MB 364 MB
SELECT * FROM events WHERE event->'actor'->'login' = '"rkusa"';
-- 129 rows in 11ms
CREATE INDEX event_idx ON events USING gin(event jsonb_path_ops);
-- canceld index creation after 20h
-- SELECT * FROM events WHERE event @> '{"actor":{"login":"rkusa"}}';
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment