Created
November 24, 2021 01:56
-
-
Save hdevalence/45b42769096a2f54a3f5cdd81d1d3a25 to your computer and use it in GitHub Desktop.
download every crate
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::path::PathBuf; | |
use futures::{stream::FuturesUnordered, StreamExt}; | |
use git2::Repository; | |
use serde::Serialize; | |
use structopt::StructOpt; | |
#[derive(Debug, StructOpt)] | |
#[structopt( | |
name = "get-all-crates", | |
about = "clones repos for all crates on crates.io" | |
)] | |
struct Opt { | |
/// The crates.io database dump file, downloaded from https://static.crates.io/db-dump.tar.gz | |
#[structopt(short, long, parse(from_os_str), default_value = "./db-dump.tar.gz")] | |
db_file: PathBuf, | |
/// The root directory for the cloned repos. | |
#[structopt(short, long, parse(from_os_str))] | |
clone_root: PathBuf, | |
/// The number of concurrent clone tasks to perform. | |
#[structopt(short, long, default_value = "16")] | |
concurrent_clones: usize, | |
} | |
#[derive(Debug, Serialize)] | |
struct CrateRepo { | |
name: String, | |
repo: String, | |
} | |
fn main() -> anyhow::Result<()> { | |
tracing_subscriber::fmt::init(); | |
let opt = Opt::from_args(); | |
tracing::info!("loading crate data"); | |
let mut repos = Vec::new(); | |
db_dump::Loader::new() | |
.crates(|row| { | |
if let Some(repo) = row.repository { | |
repos.push(CrateRepo { | |
name: row.name, | |
repo, | |
}); | |
} | |
}) | |
.load("./db-dump.tar.gz")?; | |
let num_repos = repos.len() as u32; | |
tracing::info!(?num_repos, "finished loading crate data"); | |
let runtime = tokio::runtime::Builder::new_multi_thread() | |
.max_blocking_threads(opt.concurrent_clones) | |
.enable_all() | |
.build()?; | |
let (tracker_tx, mut tracker_rx) = | |
tokio::sync::mpsc::channel::<(String, anyhow::Result<()>)>(64); | |
runtime.block_on(async move { | |
use tokio::task::{spawn, spawn_blocking}; | |
let mut all_tasks = FuturesUnordered::new(); | |
all_tasks.push(spawn(async move { | |
let mut success_count = 0; | |
let mut fail_count = 0; | |
while let Some((name, result)) = tracker_rx.recv().await { | |
match result { | |
Ok(()) => { | |
success_count += 1; | |
tracing::info!(?name, completion = ?(f64::from(success_count + fail_count)/f64::from(num_repos)), "successfully cloned crate"); | |
} | |
Err(e) => { | |
fail_count += 1; | |
tracing::error!(?name, completion = ?(f64::from(success_count + fail_count)/f64::from(num_repos)), ?e); | |
} | |
}; | |
} | |
})); | |
for CrateRepo { name, repo } in repos.into_iter() { | |
let mut clone_path = opt.clone_root.clone(); | |
let padded_name = format!("{}____", name); | |
clone_path.push(&padded_name[0..2]); | |
clone_path.push(&padded_name[2..4]); | |
clone_path.push(&name); | |
let tx = tracker_tx.clone(); | |
all_tasks.push(spawn_blocking(move || { | |
tracing::info!(?name, ?repo, ?clone_path, "starting clone"); | |
match Repository::clone(repo.as_str(), clone_path) { | |
Ok(_) => tx.blocking_send((name, Ok(()))), | |
Err(e) => tx.blocking_send((name, Err(e.into()))), | |
}.unwrap(); | |
})); | |
} | |
while let Some(_) = all_tasks.next().await { | |
} | |
Ok(()) | |
}) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment