Skip to content

Instantly share code, notes, and snippets.

@mohe2015
Last active April 9, 2022 18:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mohe2015/acc0f6daef95f3fcb1f402c2a71d2433 to your computer and use it in GitHub Desktop.
Save mohe2015/acc0f6daef95f3fcb1f402c2a71d2433 to your computer and use it in GitHub Desktop.
use std::io::{self, BufReader};
use flate2::read::GzDecoder;
use std::io::prelude::*;
use std::fs::File;
use rustc_hash::FxHashMap;
use base64::{decode_config_slice, STANDARD};
// cat /home/pi/2022_place_canvas_history.csv.gzip | gunzip | wc -c
// 21714634193
// ~22 GB
fn main() -> io::Result<()> {
let stdout = io::stdout();
let mut handle = stdout.lock();
let f = File::open("/home/pi/2022_place_canvas_history.csv.gzip")?;
let mut gz = BufReader::new(GzDecoder::new(f));
let mut next_user_id = -1;
let mut next_pixel_color = -1;
let mut user_ids = FxHashMap::default();
let mut pixel_colors = FxHashMap::default();
let mut line = Vec::<u8>::new();
let mut number_of_bytes_read = 0;
gz.read_until(b'\n', &mut line)?; // drop header
line.clear();
loop {
let line_length = gz.read_until(b'\n', &mut line)?;
if line_length == 0 {
break;
}
number_of_bytes_read += line_length;
if number_of_bytes_read % (16 * 1024) == 0 {
eprintln!("{} %", f64::from(u32::try_from(number_of_bytes_read).unwrap()) / 21714634193f64);
}
if number_of_bytes_read >= 2171463419 {
break; // TESTING
}
// almost everything in a line should be at a fixed offset so maybe cheat
let mut it = line.split(|c| *c == b',');
let timestamp = it.next().unwrap();
let mut user_id = vec![0; 64];
assert_eq!(64, decode_config_slice(it.next().unwrap(), STANDARD, &mut user_id).unwrap());
let user_id = match user_ids.get(&user_id) {
Some(v) => *v,
None => {
next_user_id += 1;
user_ids.insert(user_id.clone(), next_user_id);
next_user_id
}
};
let pixel_color = u32::from_str_radix(std::str::from_utf8(&it.next().unwrap()[1..]).unwrap(), 16).unwrap();
// TODO maybe hex decode or store statically as the colors should be known
let pixel_color = match pixel_colors.get(&pixel_color) {
Some(v) => *v,
None => {
next_pixel_color += 1;
pixel_colors.insert(pixel_color, next_pixel_color);
next_pixel_color
}
};
let coordinate_x = std::str::from_utf8(&it.next().unwrap()[1..]).unwrap().parse::<u16>().unwrap();
let coordinate_y = it.next().unwrap();
let coordinate_y = std::str::from_utf8(&coordinate_y[..coordinate_y.len()-2]).unwrap().parse::<u16>().unwrap();
writeln!(handle, "{},{},{},{},{}", std::str::from_utf8(timestamp).unwrap(), user_id, pixel_color, coordinate_x, coordinate_y)?;
line.clear();
}
eprintln!("{:#?}", pixel_colors);
eprintln!("user count: {}", next_user_id);
Ok(())
}
sudo apt install -y postgresql
curl -O https://placedata.reddit.com/data/canvas-history/2022_place_canvas_history.csv.gzip
sudo -u postgres psql --user postgres
CREATE TABLE rplace (coordinate POINT NOT NULL, pixel_color CHAR(7) NOT NULL, user_id BYTEA NOT NULL, timestamp TIMESTAMP WITHOUT TIME ZONE NOT NULL);
COPY rplace (timestamp, user_id, pixel_color, coordinate) FROM PROGRAM 'gunzip -c /home/pi/2022_place_canvas_history.csv.gzip | grep -v -P "\"\d+,\d+,\d+,\d+\""' WITH CSV HEADER;
VACUUM (VERBOSE, ANALYZE) rplace;
EXPLAIN ANALYZE SELECT COUNT(*) FROM rplace;
set maintenance_work_mem TO "4GB";
CREATE INDEX timestamp_idx ON rplace USING BRIN (timestamp);
CREATE INDEX coordinate_idx ON rplace USING GIST (coordinate);
CREATE INDEX user_id_idx ON rplace USING HASH (user_id);
CREATE INDEX pixel_color_idx ON rplace USING HASH (pixel_color);
ALTER SYSTEM SET maintenance_work_mem = '4GB';
SELECT pg_reload_conf();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment