Winslett/Gemfile Secret

## README.md

      
    Raw
  

              README.md
            
          
    Data generator for Postgres Hyperloglog

The generator.rb file in this repo will create semi-random activity
data that can be used for experimenting with hyperloglog.
Crunchy Data has a full HLL tutorial at https://crunchydata.com/blog/high-compression-metrics-stograge-with-postgres-hyperloglog
What will happen if I run this?

By running this, you'll have a continuously running process that
generates two tables: customers and activities. Then, it adds data to
those customers and activities data in a way that resembles behavior.
The automatically created customers table looks like this:
                          Table "public.customers"
 Column |  Type   | Collation | Nullable |             Default
--------+---------+-----------+----------+----------------------------------
 id     | integer |           | not null | generated by default as identity
 name   | text    |           | not null |
 email  | text    |           | not null |

The automatically created activities table looks like this:
                         Table "public.activities"
   Column    |            Type             | Collation | Nullable | Default
-------------+-----------------------------+-----------+----------+---------
 received_at | timestamp without time zone |           | not null |
 customer_id | integer                     |           | not null |
 action      | text                        |           | not null |

How to run:

I expect you have a decently functioning Ruby environment.  If not,
consult a tutorial (like RubyMine's tutorial Set up a Ruby development environment).

Clone this repo: git clone git@gist.github.com:8c594d836c7a802bff1f0749e67c8ba4.git
Install the Postgres & Faker gem: bundler install
Run the generator with a database:

DATABASE_URL="postgres://user:pass@host:port/dbname" ruby generator.rb

Once this works, you'll be off and running to test Hyperloglog on
Postgres.

  
## active_customers.sql
SELECT
  id,
  name,
  email
FROM customers WHERE id IN (SELECT DISTINCT customer_id FROM activities WHERE activities.received_at > LEAST(now(), ?) - '1 day'::interval);

## Gemfile
source "https://rubygems.org"

gem 'faker'
gem 'sequel'
gem 'pg'

## Gemfile.lock
GEM
  remote: https://rubygems.org/
  specs:
    concurrent-ruby (1.2.2)
    faker (3.2.0)
      i18n (>= 1.8.11, < 2)
    i18n (1.14.0)
      concurrent-ruby (~> 1.0)
    pg (1.5.3)
    sequel (5.69.0)

PLATFORMS
  arm64-darwin-22

DEPENDENCIES
  faker
  pg
  sequel

BUNDLED WITH
   2.4.10

## generator.rb
require 'bundler'
Bundler.require

DB = Sequel.connect(ENV['DATABASE_URL'])

DB.create_table? :customers do
  primary_key :id
  String :name, null: false
  String :email, unique: true, null: false
end

DB.create_table? :activities do
  DateTime :received_at, null: false
  foreign_key :customer_id, :customers, null: false
  String :action, null: false
end

ACTIONS = ["show", "wander", "join", "greet", "celebrate", "ignite",  "climb", "nap", "sabotage", "take", "walk", "leap"]

time = nil
DB.fetch("SELECT max(received_at) AS latest_received FROM activities") do |row|
  time = row[:latest_received]
end
time ||= Time.parse('2023-01-01 00:00:00')

puts "Loading existing active customers"
active_customers = []
DB.fetch(File.read('active_customers.sql'), time) do |row|
  active_customers << row
end

puts "Loading existing inactive customers"
inactive_customers = []
DB.fetch(File.read('inactive_customers.sql'), time) do |row|
  inactive_customers << row
end

while true do
  puts time.inspect
  if time < Time.now
    time = [time + Kernel.rand(1800), Time.now].min
  else
    print "waiting"
    1.upto(60) { print "."; sleep 1}
    next
  end

  1.upto(([1, active_customers.length * 0.01 * Kernel.rand].max) * Math.log(Kernel.rand) * -1).each do
    customer = { name: Faker::Name.name, email: Faker::Internet.email.gsub(/@/, "#{Kernel.rand(10000)}@") }
    customer[:id] = DB[:customers].insert(customer)
    active_customers << customer
  end

  1.upto(active_customers.length * (Kernel.rand / 100.0)).each do
    inactive_customers << active_customers.delete_at(active_customers.length * Kernel.rand)
  end

  1.upto(inactive_customers.length * (Kernel.rand / 100.0)).each do
    active_customers << inactive_customers.delete_at(inactive_customers.length * Kernel.rand)
  end

  active_customers.shuffle[0..[Kernel.rand(active_customers.count), 1].max].each do |customer|
    time += Kernel.rand(1000.0 / active_customers.count)
    next if time > Time.now
    DB[:activities].insert({received_at: time, customer_id: customer[:id], action: ACTIONS.sample})
  end

  puts '--------------'
  puts ["Active Customers", active_customers.count].join(" : ")
  puts ["Inactive Customers", inactive_customers.count].join(" : ")
end

## inactive_customers.sql
SELECT
  id,
  name,
  email
FROM customers
WHERE id IN (SELECT DISTINCT customer_id FROM activities WHERE activities.received_at < LEAST(now(), ?) - '1 day'::interval);
	SELECT
	id,
	name,
	email
	FROM customers WHERE id IN (SELECT DISTINCT customer_id FROM activities WHERE activities.received_at > LEAST(now(), ?) - '1 day'::interval);
	source "https://rubygems.org"

	gem 'faker'
	gem 'sequel'
	gem 'pg'
	GEM
	remote: https://rubygems.org/
	specs:
	concurrent-ruby (1.2.2)
	faker (3.2.0)
	i18n (>= 1.8.11, < 2)
	i18n (1.14.0)
	concurrent-ruby (~> 1.0)
	pg (1.5.3)
	sequel (5.69.0)

	PLATFORMS
	arm64-darwin-22

	DEPENDENCIES
	faker
	pg
	sequel

	BUNDLED WITH
	2.4.10
	require 'bundler'
	Bundler.require

	DB = Sequel.connect(ENV['DATABASE_URL'])

	DB.create_table? :customers do
	primary_key :id
	String :name, null: false
	String :email, unique: true, null: false
	end

	DB.create_table? :activities do
	DateTime :received_at, null: false
	foreign_key :customer_id, :customers, null: false
	String :action, null: false
	end

	ACTIONS = ["show", "wander", "join", "greet", "celebrate", "ignite", "climb", "nap", "sabotage", "take", "walk", "leap"]

	time = nil
	DB.fetch("SELECT max(received_at) AS latest_received FROM activities") do \|row\|
	time = row[:latest_received]
	end
	time \|\|= Time.parse('2023-01-01 00:00:00')

	puts "Loading existing active customers"
	active_customers = []
	DB.fetch(File.read('active_customers.sql'), time) do \|row\|
	active_customers << row
	end

	puts "Loading existing inactive customers"
	inactive_customers = []
	DB.fetch(File.read('inactive_customers.sql'), time) do \|row\|
	inactive_customers << row
	end

	while true do
	puts time.inspect
	if time < Time.now
	time = [time + Kernel.rand(1800), Time.now].min
	else
	print "waiting"
	1.upto(60) { print "."; sleep 1}
	next
	end

	1.upto(([1, active_customers.length * 0.01 * Kernel.rand].max) * Math.log(Kernel.rand) * -1).each do
	customer = { name: Faker::Name.name, email: Faker::Internet.email.gsub(/@/, "#{Kernel.rand(10000)}@") }
	customer[:id] = DB[:customers].insert(customer)
	active_customers << customer
	end

	1.upto(active_customers.length * (Kernel.rand / 100.0)).each do
	inactive_customers << active_customers.delete_at(active_customers.length * Kernel.rand)
	end

	1.upto(inactive_customers.length * (Kernel.rand / 100.0)).each do
	active_customers << inactive_customers.delete_at(inactive_customers.length * Kernel.rand)
	end

	active_customers.shuffle[0..[Kernel.rand(active_customers.count), 1].max].each do \|customer\|
	time += Kernel.rand(1000.0 / active_customers.count)
	next if time > Time.now
	DB[:activities].insert({received_at: time, customer_id: customer[:id], action: ACTIONS.sample})
	end

	puts '--------------'
	puts ["Active Customers", active_customers.count].join(" : ")
	puts ["Inactive Customers", inactive_customers.count].join(" : ")
	end