Skip to content

Instantly share code, notes, and snippets.

@HuakunShen
Created December 13, 2024 01:06
Show Gist options
  • Save HuakunShen/4530b36a5934c22084cd44f0f16f021d to your computer and use it in GitHub Desktop.
Save HuakunShen/4530b36a5934c22084cd44f0f16f021d to your computer and use it in GitHub Desktop.
Surrealdb Insert Speed Comparison with sqlite
// https://datasets.imdbws.com/name.basics.tsv.gz
use anyhow::{Context, Result};
use csv::ReaderBuilder;
use rusqlite::{params, Connection};
use std::time::Instant;
fn main() -> Result<()> {
let start = Instant::now();
// Create SQLite database and table
let mut conn = Connection::open("imdb.db")?;
conn.execute(
"CREATE TABLE IF NOT EXISTS name_basics (
nconst TEXT PRIMARY KEY,
primary_name TEXT NOT NULL,
birth_year INTEGER,
death_year INTEGER,
primary_profession TEXT,
known_for_titles TEXT
)",
[],
)?;
// Begin transaction for faster inserts
let tx = conn.transaction()?;
// Prepare the insert statement
let mut stmt: rusqlite::Statement<'_> = tx.prepare(
"INSERT INTO name_basics (
nconst, primary_name, birth_year, death_year, primary_profession, known_for_titles
) VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
)?;
let csv_path = std::env::current_dir()?.join("../name.basics.tsv");
println!("CSV path: {:?}", csv_path);
// Create CSV reader
let mut rdr = ReaderBuilder::new()
.delimiter(b'\t') // IMDb files use tab as delimiter
.from_path(csv_path)?;
let mut count = 0;
// Process each record
for result in rdr.records() {
let record = result?;
let birth_year = match &record[2] {
"\\N" => None,
year => Some(year.parse::<i32>().context("Failed to parse birth year")?),
};
let death_year = match &record[3] {
"\\N" => None,
year => Some(year.parse::<i32>().context("Failed to parse death year")?),
};
stmt.execute(params![
&record[0], // nconst
&record[1], // primary_name
birth_year, // birth_year
death_year, // death_year
&record[4], // primary_profession
&record[5], // known_for_titles
])?;
count += 1;
if count % 100_000 == 0 {
println!("Processed {} records", count);
}
}
// Drop the prepared statement before committing to release the borrow
drop(stmt);
// Commit the transaction
tx.commit()?;
let duration = start.elapsed();
println!("Successfully imported {} records in {:?}", count, duration);
Ok(())
}
// https://datasets.imdbws.com/name.basics.tsv.gz
use anyhow::Result;
use csv::ReaderBuilder;
use serde::{Deserialize, Serialize};
use std::time::Instant;
use surrealdb::engine::local::RocksDb;
use surrealdb::Surreal;
#[derive(Debug, Serialize, Deserialize)]
struct NameBasic {
nconst: String,
primary_name: String,
birth_year: Option<i32>,
death_year: Option<i32>,
primary_profession: String,
known_for_titles: String,
}
#[tokio::main]
async fn main() -> Result<()> {
let start = Instant::now();
// Create database connection
let db_path = std::env::current_dir()?.join("imdb.surrealdb");
// if "imdb.surrealdb" exists, delete it
if std::path::Path::new(&db_path).exists() {
std::fs::remove_dir_all(&db_path)?;
}
let db = Surreal::new::<RocksDb>(db_path.clone()).await?;
db.use_ns("imdb").use_db("names").await?;
let csv_path = std::env::current_dir()?.join("../name.basics.tsv");
println!("CSV path: {:?}", csv_path);
// Count total rows first
let row_count = ReaderBuilder::new()
.delimiter(b'\t')
.from_path(&csv_path)?
.into_records()
.count();
println!("Total rows to process: {}", row_count);
// Create reader again for actual processing
let mut rdr = ReaderBuilder::new().delimiter(b'\t').from_path(csv_path)?;
let mut count = 0;
println!("Processing records...");
let tx = db.transaction().await?;
// Process each record
for result in rdr.records() {
let record = result?;
let birth_year = match &record[2] {
"\\N" => None,
year => Some(year.parse::<i32>()?),
};
let death_year = match &record[3] {
"\\N" => None,
year => Some(year.parse::<i32>()?),
};
// Create record in SurrealDB
let name = NameBasic {
nconst: record[0].to_string(),
primary_name: record[1].to_string(),
birth_year,
death_year,
primary_profession: record[4].to_string(),
known_for_titles: record[5].to_string(),
};
// Insert using the nconst as the record ID
(&tx)
.create::<Option<NameBasic>>("name_basic")
.content(name)
.await?;
count += 1;
if count % 100_000 == 0 {
println!("Processed {} NameBasics", count);
}
}
println!("Starting database insert...");
tx.commit().await?;
let duration = start.elapsed();
println!("Successfully imported {} records in {:?}", count, duration);
Ok(())
}
@Dhghomon
Copy link

Interesting that there is technically a .transaction() method, I guess it's a placeholder for the time being because it is hidden in the docs and there is a note saying that it isn't supported yet. At the moment a manual .query("BEGIN").query("CREATE person").query("COMMIT") is the way to use a manual transaction.

image

@HuakunShen
Copy link
Author

@Dhghomon With this syntax how can I insert a few millions of rows of data in a transaction?

@AlexFrid
Copy link

AlexFrid commented Jan 8, 2025

Hey @HuakunShen, while the client side transactions have not been implemented yet, you quickly create mock data like this in SurrealQL

CREATE |name_basics:3000000| CONTENT {
    nconst: rand::string(10),
    primary_name: rand::string(10),
    birth_year: rand::int(1970, 2024),
    death_year: rand::int(1970, 2024),
    primary_profession: rand::string(10),
    known_for_titles: rand::string(15),
};

To get more info on inserting data you can look at this lesson in the SurrealDB fundamentals course: https://surrealdb.com/learn/fundamentals/schemaless/inserting-data

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment