Skip to content

Instantly share code, notes, and snippets.

@hdevalence
Created November 24, 2021 01:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hdevalence/45b42769096a2f54a3f5cdd81d1d3a25 to your computer and use it in GitHub Desktop.
Save hdevalence/45b42769096a2f54a3f5cdd81d1d3a25 to your computer and use it in GitHub Desktop.
download every crate
use std::path::PathBuf;
use futures::{stream::FuturesUnordered, StreamExt};
use git2::Repository;
use serde::Serialize;
use structopt::StructOpt;
// Command-line options for the tool, parsed by the `StructOpt` derive
// (see `Opt::from_args()` in `main`).
//
// NOTE(review): the `///` lines on the fields below are presumably picked up
// by structopt as the per-option help text, so treat edits to them as
// user-visible changes — confirm against the structopt documentation.
#[derive(Debug, StructOpt)]
#[structopt(
name = "get-all-crates",
about = "clones repos for all crates on crates.io"
)]
struct Opt {
/// The crates.io database dump file, downloaded from https://static.crates.io/db-dump.tar.gz
// Parsed from an OS string into a `PathBuf`; defaults to `./db-dump.tar.gz`.
#[structopt(short, long, parse(from_os_str), default_value = "./db-dump.tar.gz")]
db_file: PathBuf,
/// The root directory for the cloned repos.
// No default value is declared for this option.
#[structopt(short, long, parse(from_os_str))]
clone_root: PathBuf,
/// The number of concurrent clone tasks to perform.
// Used as the runtime's `max_blocking_threads` limit; defaults to 16.
#[structopt(short, long, default_value = "16")]
concurrent_clones: usize,
}
/// A crate name paired with the repository location recorded for it.
///
/// Built in `main` from rows of the database dump that have a repository
/// value; rows without one are skipped.
#[derive(Debug, Serialize)]
struct CrateRepo {
/// The crate's name, taken from the dump row; also used to build the clone path.
name: String,
/// The repository string from the dump row, passed as-is to `Repository::clone`.
repo: String,
}
fn main() -> anyhow::Result<()> {
tracing_subscriber::fmt::init();
let opt = Opt::from_args();
tracing::info!("loading crate data");
let mut repos = Vec::new();
db_dump::Loader::new()
.crates(|row| {
if let Some(repo) = row.repository {
repos.push(CrateRepo {
name: row.name,
repo,
});
}
})
.load("./db-dump.tar.gz")?;
let num_repos = repos.len() as u32;
tracing::info!(?num_repos, "finished loading crate data");
let runtime = tokio::runtime::Builder::new_multi_thread()
.max_blocking_threads(opt.concurrent_clones)
.enable_all()
.build()?;
let (tracker_tx, mut tracker_rx) =
tokio::sync::mpsc::channel::<(String, anyhow::Result<()>)>(64);
runtime.block_on(async move {
use tokio::task::{spawn, spawn_blocking};
let mut all_tasks = FuturesUnordered::new();
all_tasks.push(spawn(async move {
let mut success_count = 0;
let mut fail_count = 0;
while let Some((name, result)) = tracker_rx.recv().await {
match result {
Ok(()) => {
success_count += 1;
tracing::info!(?name, completion = ?(f64::from(success_count + fail_count)/f64::from(num_repos)), "successfully cloned crate");
}
Err(e) => {
fail_count += 1;
tracing::error!(?name, completion = ?(f64::from(success_count + fail_count)/f64::from(num_repos)), ?e);
}
};
}
}));
for CrateRepo { name, repo } in repos.into_iter() {
let mut clone_path = opt.clone_root.clone();
let padded_name = format!("{}____", name);
clone_path.push(&padded_name[0..2]);
clone_path.push(&padded_name[2..4]);
clone_path.push(&name);
let tx = tracker_tx.clone();
all_tasks.push(spawn_blocking(move || {
tracing::info!(?name, ?repo, ?clone_path, "starting clone");
match Repository::clone(repo.as_str(), clone_path) {
Ok(_) => tx.blocking_send((name, Ok(()))),
Err(e) => tx.blocking_send((name, Err(e.into()))),
}.unwrap();
}));
}
while let Some(_) = all_tasks.next().await {
}
Ok(())
})
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment