Skip to content

Instantly share code, notes, and snippets.

Created July 6, 2024 22:51
Show Gist options
  • Save trevorbernard/8ea34911b7aa4d6f1f93de1b318dd5b2 to your computer and use it in GitHub Desktop.
Save trevorbernard/8ea34911b7aa4d6f1f93de1b318dd5b2 to your computer and use it in GitHub Desktop.
Ingestion Experiment
use glob::glob;
use std::{
path::{Path, PathBuf}, time::Instant,
/// The parsed metadata from a Precomputed Block filename.
///
/// Each mainnet precomputed block filename has the following grammar:
///
/// ```text
/// filename     ::= network "-" block_height "-" state_hash ".json" ;
/// network      ::= "mainnet" ;
/// block_height ::= digit+ ;
/// state_hash   ::= "3N" alphanumeric{50} ;
/// digit        ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ;
/// alphanumeric ::= "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" |
///                  "l" | "m" | "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" |
///                  "w" | "x" | "y" | "z" | "A" | "B" | "C" | "D" | "E" | "F" | "G" |
///                  "H" | "I" | "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
///                  "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" | digit ;
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BlockFileMetaData<'a> {
    /// The block height of a precomputed block
    pub(crate) height: usize,
    /// The block network of the precomputed block
    pub(crate) network: &'a str,
    /// The block state hash of the precomputed block
    pub(crate) state_hash: &'a str,
}

/// A [`BlockFile`] pairs the path of a precomputed block with the
/// metadata parsed from its filename.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BlockFile<'a> {
    /// The underlying Path reference to the block
    pub(crate) path: &'a Path,
    /// The filename metadata for the block
    pub(crate) metadata: BlockFileMetaData<'a>,
}

impl<'a> BlockFileMetaData<'a> {
    /// The precomputed block height
    pub fn height(&self) -> usize {
        self.height
    }

    /// The precomputed block network
    pub fn network(&self) -> &'a str {
        self.network
    }

    /// The precomputed block state hash
    pub fn state_hash(&self) -> &'a str {
        self.state_hash
    }
}

impl<'a> BlockFileMetaData<'a> {
    /// Efficiently parse metadata from the filename of a precomputed
    /// block.
    ///
    /// Only the final path component is inspected, so both bare
    /// filenames and longer paths are accepted. The returned value
    /// borrows from `path`, eliminating unnecessary allocations.
    ///
    /// Returns `None` when:
    /// - the path has no filename or the filename is not valid UTF-8,
    /// - the filename does not end with `.json`,
    /// - the name does not contain at least two `-` separators,
    /// - the block height is not a run of ASCII digits that fits in a
    ///   [`usize`],
    /// - the state hash does not start with `"3N"`, is not exactly 52
    ///   characters long, or contains a non-alphanumeric character.
    pub fn from_filename<P: AsRef<Path> + 'a>(path: &'a P) -> Option<Self> {
        let filename = path.as_ref().file_name()?.to_str()?;

        // Ensure the filename ends with ".json" and drop the extension
        let name_without_ext = filename.strip_suffix(".json")?;

        // Split into network, block height and state hash. Limiting the
        // split to three parts keeps everything after the second hyphen
        // in the state hash, where it is rejected by the checks below.
        let mut parts = name_without_ext.splitn(3, '-');
        let network = parts.next()?;
        let height_str = parts.next()?;
        let state_hash = parts.next()?;

        // `usize::from_str` tolerates a leading '+', which the filename
        // grammar (digit+) does not allow, so check the digits explicitly.
        if !height_str.bytes().all(|b| b.is_ascii_digit()) {
            return None;
        }
        // Rejects the empty string and values that overflow `usize`.
        let height = height_str.parse().ok()?;

        // Ensure the state hash starts with "3N", is 52 characters long
        // and is made of ASCII alphanumerics only (so byte length and
        // character count agree).
        if state_hash.len() != 52
            || !state_hash.starts_with("3N")
            || !state_hash.bytes().all(|b| b.is_ascii_alphanumeric())
        {
            return None;
        }

        Some(BlockFileMetaData {
            height,
            network,
            state_hash,
        })
    }
}
/// The [`BlockIngestionSummary`] represents the results of the
/// Precomputed block ingestion process.
///
/// The [`Default`] value has every counter set to zero.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct BlockIngestionSummary {
    /// The total number of blocks in the ingestion directory
    pub(crate) total_blocks_count: usize,
    /// The total number of canonical blocks in the ingestion directory
    pub(crate) total_canonical_blocks_count: usize,
    /// The total number of orphaned blocks in the ingestion directory
    pub(crate) total_orphaned_blocks_count: usize,
    /// The total number of pending blocks in the ingestion directory
    pub(crate) total_pending_blocks_count: usize,
}
/// It accepts a slice of [`AsRef<Path>`] and returns a
/// [`Vec<BlockFile<'a>`] in ascending order by block height
fn get_sorted_block_files<'a, P: AsRef<Path>>(
paths: &'a [P],
) -> anyhow::Result<Vec<BlockFile<'a>>> {
let mut block_files: Vec<_> = paths
.filter_map(|filename| {
BlockFileMetaData::from_filename(filename).map(|metadata| BlockFile {
path: filename.as_ref(),
// No need to use [`sort_by_cached_key`] here since the key function isn't
// expensive. Sort in ascending order
block_files.sort_by_key(|bf| bf.metadata.height);
/// Ingest a directory of Precomputed Blocks to create an initial
/// database.
///
/// If `path` is not a directory, a warning is logged and a default
/// [`BlockIngestionSummary`] is returned. The same happens on the
/// "path must have entries" branch. Otherwise every file matching
/// `<path>/*-*-*.json` is collected, sorted by block height via
/// `get_sorted_block_files`, and the count and elapsed time are
/// printed to stdout.
///
/// This is a blocking operation that should be called from an OS
/// thread. If you are using an async runtime like [tokio], be aware
/// that this operation won't be cancelable since it uses blocking
/// I/O and will lock up a Runtime thread.
///
/// # Errors
///
/// Propagates the error from reading the directory and from building
/// the glob pattern.
///
/// # Panics
///
/// Panics if `get_sorted_block_files` returns an error.
pub fn ingest_blocks<P: AsRef<Path>>(path: P) -> anyhow::Result<BlockIngestionSummary> {
// Validate that the path_ref is a non-empty directory
let path_ref = path.as_ref();
if !path_ref.is_dir() {
log::warn!("path must be a directory: {}", path_ref.display());
return Ok(BlockIngestionSummary::default());
} else {
let mut entries = fs::read_dir(path_ref)?;
// NOTE(review): the condition of this `if` is missing in this view --
// presumably it checks that `entries` yields no items (e.g.
// `entries.next().is_none()`); confirm against the original source.
if {
log::warn!("path must have entries: {}", path_ref.display());
return Ok(BlockIngestionSummary::default());
// Find the canonical chain and ingest w/o adding to the witness tree
// NOTE(review): only the globbing and sorting steps are visible below.
let time = Instant::now();
// Match files named like `<network>-<height>-<state_hash>.json`.
// NOTE(review): the directory is interpolated into the glob pattern as-is,
// so glob metacharacters in `path` would be interpreted as pattern syntax.
let pattern = format!("{}/*-*-*.json", path.as_ref().display());
// Entries that glob fails to read are dropped without being reported.
let filenames: Vec<PathBuf> = glob(&pattern)?.filter_map(|x| x.ok()).collect();
// NOTE(review): this function returns `anyhow::Result`, so the error could be
// propagated to the caller instead of panicking. The leading underscore on
// `_sorted_block_files` is misleading because the binding is read below.
let _sorted_block_files = match get_sorted_block_files(filenames.as_slice()) {
Ok(sorted) => sorted,
Err(e) => panic!("Unable to sort block files: {e}"),
let elapsed = time.elapsed();
let size = _sorted_block_files.len();
// NOTE(review): `{elapsed:?}` already prints the `Duration` with its own unit,
// so the trailing " ms" can produce output such as "1.2s ms".
println!("Sorted block files {size} in: {elapsed:?} ms");
// Unit tests for filename parsing and for the directory checks performed by
// `ingest_blocks`.
// NOTE(review): no `#[cfg(test)]` / `#[test]` attributes, trailing `Ok(())`
// values or closing braces are visible in this view -- confirm they exist in
// the original file.
mod tests {
use super::BlockFileMetaData;
use crate::ingestion::{ingest_blocks, BlockIngestionSummary};
use tempfile::NamedTempFile;
// Runs ingestion against a fixed local directory and discards the result.
// NOTE(review): the path is an absolute path under one user's home directory,
// so this exercises nothing on other machines; consider marking it `#[ignore]`
// or pointing it at a checked-in fixture.
fn test_foobar() -> anyhow::Result<()> {
let path = "/Users/tbernard/blocks/100000-blocks";
let _ = ingest_blocks(path);
// An empty temporary directory must produce the default summary.
fn test_ingestion_on_empty_dir() -> anyhow::Result<()> {
let tmp_dir = tempfile::tempdir().expect("empty tmp dir");
let path = tmp_dir.path();
let summary = ingest_blocks(path)?;
assert_eq!(BlockIngestionSummary::default(), summary);
// A path that is a regular file (not a directory) must produce the default
// summary.
fn test_ingestion_on_non_dir() -> anyhow::Result<()> {
let tmp_file = NamedTempFile::new().expect("tmp file");
let path = tmp_file.path();
let summary = ingest_blocks(path)?;
assert_eq!(BlockIngestionSummary::default(), summary);
// The height field `3596b04` contains a non-digit character.
// NOTE(review): no assertion on `block_file_meta` is visible in this view;
// presumably the test expects `None` -- confirm.
fn test_invalid_block_height_filename() -> anyhow::Result<()> {
let filename = "mainnet-3596b04-3NLRTfY4kZyJtvaP4dFenDcxfoMfT3uEpkWS913KkeXLtziyVd15.json";
let block_file_meta = BlockFileMetaData::from_filename(&filename);
// The filename ends in `.foobar` instead of `.json`.
// NOTE(review): no assertion on `block_file_meta` is visible in this view;
// presumably the test expects `None` -- confirm.
fn test_invalid_extension_filename() -> anyhow::Result<()> {
let filename = "mainnet-359604-3NLRTfY4kZyJtvaP4dFenDcxfoMfT3uEpkWS913KkeXLtziyVd15.foobar";
let block_file_meta = BlockFileMetaData::from_filename(&filename);
// A well-formed bare filename parses into network and height.
fn test_valid_filename() -> anyhow::Result<()> {
let filename = "mainnet-359604-3NLRTfY4kZyJtvaP4dFenDcxfoMfT3uEpkWS913KkeXLtziyVd15.json";
if let Some(block_file_meta) = BlockFileMetaData::from_filename(&filename) {
// NOTE(review): the right-hand side is missing in this view -- presumably
// `block_file_meta.network()`; confirm against the original source.
let network =;
let height = block_file_meta.height();
// NOTE(review): `state_hash` is read but no assertion on it is visible here.
let state_hash = block_file_meta.state_hash();
assert_eq!("mainnet", network);
assert_eq!(359604_usize, height);
} else {
panic!("Unable to parse block file metadata");
// A filename preceded by directory components parses into network and height.
fn test_valid_long_path() -> anyhow::Result<()> {
let filename = "tests/data/sequential_blocks/mainnet-105494-3NKXsaznJ6WdyA4PHfXxn25RzVanzQsNMZrxjidbhoBug8R4LZDy.json";
if let Some(block_file_meta) = BlockFileMetaData::from_filename(&filename) {
// NOTE(review): the right-hand side is missing in this view -- presumably
// `block_file_meta.network()`; confirm against the original source.
let network =;
let height = block_file_meta.height();
// NOTE(review): `state_hash` is read but no assertion on it is visible here.
let state_hash = block_file_meta.state_hash();
assert_eq!("mainnet", network);
assert_eq!(105494_usize, height);
} else {
panic!("Unable to parse block file metadata");
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment