Last active
August 29, 2015 14:17
-
-
Save rkusa/b6d8c72a0a37494889da to your computer and use it in GitHub Desktop.
populate test database with github archive data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri' | |
require 'zlib' | |
require 'yajl' | |
require 'net/http' | |
require 'json' | |
Net::HTTP.start('localhost', 5984, read_timeout: 3600) do |http| | |
24.times do |d| | |
24.times do |h| | |
url = 'http://data.githubarchive.org/2015-03-%02d-%d.json.gz' % [d + 1, h] | |
puts 'importing %s' % url | |
gz = open(url) | |
js = Zlib::GzipReader.new(gz).read | |
body = '{"docs":[%s]}' % js.gsub!(/[\n]+/, ',').chomp(','); | |
# puts body | |
req = Net::HTTP::Post.new('/events/_bulk_docs') | |
req["content-type"] = "application/json" | |
req.body = body | |
http.request(req) | |
end | |
end | |
end | |
# curl http://localhost:5984/events -X DELETE | |
# curl http://localhost:5984/events -X PUT |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# from http://data.githubarchive.org/2015-03-01-0.json.gz | |
# to http://data.githubarchive.org/2015-03-01-18.json.gz | |
curl http://localhost:5984/events | |
# {"doc_count":10047784,"data_size":12241214233} | |
# that are 11.4GB | |
curl http://localhost:5984/events/_design/index -X PUT -H 'Content-Type: application/json' -d '{"language":"javascript","views":{"login":{"map":"function(doc) { emit(doc.actor.login, null) }"}}}' | |
curl http://localhost:5984/_active_tasks | |
# [{"pid":"<0.25716.5>","changes_done":46965,"database":"events","design_document":"_design/index","progress":0,"started_on":1427314967,"total_changes":10047785,"type":"indexer","updated_on":1427315035}] | |
curl -o /dev/null -s -w "%{time_total}" http://localhost:5984/events/_design/index/_view/login\?key\=\"rkusa\"\&include_docs=true | |
# 0.117s |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri' | |
require 'zlib' | |
require 'yajl' | |
require 'pg' | |
require 'json' | |
conn ||= PG.connect(dbname: 'yarvis') | |
conn.prepare 'insert_event', <<-SQL | |
INSERT INTO events VALUES( | |
DEFAULT, | |
$1 | |
); | |
SQL | |
24.times do |d| # 1. - 24. | |
24.times do |h| # 24h | |
url = 'http://data.githubarchive.org/2015-01-%02d-%d.json.gz' % [d + 1, h] | |
puts 'importing %s' % url | |
gz = open(url) | |
js = Zlib::GzipReader.new(gz).read | |
Yajl::Parser.parse(js) do |event| | |
begin | |
conn.exec_prepared 'insert_event', [event.to_json] | |
rescue PG::UntranslatableCharacter | |
end | |
end | |
end | |
end | |
# DROP TABLE events; | |
# CREATE TABLE events ( | |
# id BIGSERIAL PRIMARY KEY, | |
# event JSONB | |
# ) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- from http://data.githubarchive.org/2015-03-01-0.json.gz | |
-- to http://data.githubarchive.org/2015-03-01-18.json.gz | |
SELECT COUNT(*) FROM events; | |
--- 10.047.681 | |
CREATE INDEX login_idx ON events ((event->'actor'->'login')); | |
SELECT | |
pg_size_pretty(pg_total_relation_size('events')) AS total, | |
pg_size_pretty(pg_relation_size('events')) AS table, | |
pg_size_pretty(pg_relation_size('login_idx')) AS login_index, | |
pg_size_pretty(pg_relation_size('event_idx')) AS event_index | |
-- total table index | |
-- 14 GB 9725 MB 364 MB | |
SELECT * FROM events WHERE event->'actor'->'login' = '"rkusa"'; | |
-- 129 rows in 11ms | |
CREATE INDEX event_idx ON events USING gin(event jsonb_path_ops); | |
-- canceld index creation after 20h | |
-- SELECT * FROM events WHERE event @> '{"actor":{"login":"rkusa"}}'; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment