Created
December 13, 2024 01:06
-
-
Save HuakunShen/4530b36a5934c22084cd44f0f16f021d to your computer and use it in GitHub Desktop.
Surrealdb Insert Speed Comparison with sqlite
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// https://datasets.imdbws.com/name.basics.tsv.gz | |
use anyhow::{Context, Result}; | |
use csv::ReaderBuilder; | |
use rusqlite::{params, Connection}; | |
use std::time::Instant; | |
fn main() -> Result<()> { | |
let start = Instant::now(); | |
// Create SQLite database and table | |
let mut conn = Connection::open("imdb.db")?; | |
conn.execute( | |
"CREATE TABLE IF NOT EXISTS name_basics ( | |
nconst TEXT PRIMARY KEY, | |
primary_name TEXT NOT NULL, | |
birth_year INTEGER, | |
death_year INTEGER, | |
primary_profession TEXT, | |
known_for_titles TEXT | |
)", | |
[], | |
)?; | |
// Begin transaction for faster inserts | |
let tx = conn.transaction()?; | |
// Prepare the insert statement | |
let mut stmt: rusqlite::Statement<'_> = tx.prepare( | |
"INSERT INTO name_basics ( | |
nconst, primary_name, birth_year, death_year, primary_profession, known_for_titles | |
) VALUES (?1, ?2, ?3, ?4, ?5, ?6)", | |
)?; | |
let csv_path = std::env::current_dir()?.join("../name.basics.tsv"); | |
println!("CSV path: {:?}", csv_path); | |
// Create CSV reader | |
let mut rdr = ReaderBuilder::new() | |
.delimiter(b'\t') // IMDb files use tab as delimiter | |
.from_path(csv_path)?; | |
let mut count = 0; | |
// Process each record | |
for result in rdr.records() { | |
let record = result?; | |
let birth_year = match &record[2] { | |
"\\N" => None, | |
year => Some(year.parse::<i32>().context("Failed to parse birth year")?), | |
}; | |
let death_year = match &record[3] { | |
"\\N" => None, | |
year => Some(year.parse::<i32>().context("Failed to parse death year")?), | |
}; | |
stmt.execute(params![ | |
&record[0], // nconst | |
&record[1], // primary_name | |
birth_year, // birth_year | |
death_year, // death_year | |
&record[4], // primary_profession | |
&record[5], // known_for_titles | |
])?; | |
count += 1; | |
if count % 100_000 == 0 { | |
println!("Processed {} records", count); | |
} | |
} | |
// Drop the prepared statement before committing to release the borrow | |
drop(stmt); | |
// Commit the transaction | |
tx.commit()?; | |
let duration = start.elapsed(); | |
println!("Successfully imported {} records in {:?}", count, duration); | |
Ok(()) | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// https://datasets.imdbws.com/name.basics.tsv.gz | |
use anyhow::Result; | |
use csv::ReaderBuilder; | |
use serde::{Deserialize, Serialize}; | |
use std::time::Instant; | |
use surrealdb::engine::local::RocksDb; | |
use surrealdb::Surreal; | |
#[derive(Debug, Serialize, Deserialize)] | |
struct NameBasic { | |
nconst: String, | |
primary_name: String, | |
birth_year: Option<i32>, | |
death_year: Option<i32>, | |
primary_profession: String, | |
known_for_titles: String, | |
} | |
#[tokio::main] | |
async fn main() -> Result<()> { | |
let start = Instant::now(); | |
// Create database connection | |
let db_path = std::env::current_dir()?.join("imdb.surrealdb"); | |
// if "imdb.surrealdb" exists, delete it | |
if std::path::Path::new(&db_path).exists() { | |
std::fs::remove_dir_all(&db_path)?; | |
} | |
let db = Surreal::new::<RocksDb>(db_path.clone()).await?; | |
db.use_ns("imdb").use_db("names").await?; | |
let csv_path = std::env::current_dir()?.join("../name.basics.tsv"); | |
println!("CSV path: {:?}", csv_path); | |
// Count total rows first | |
let row_count = ReaderBuilder::new() | |
.delimiter(b'\t') | |
.from_path(&csv_path)? | |
.into_records() | |
.count(); | |
println!("Total rows to process: {}", row_count); | |
// Create reader again for actual processing | |
let mut rdr = ReaderBuilder::new().delimiter(b'\t').from_path(csv_path)?; | |
let mut count = 0; | |
println!("Processing records..."); | |
let tx = db.transaction().await?; | |
// Process each record | |
for result in rdr.records() { | |
let record = result?; | |
let birth_year = match &record[2] { | |
"\\N" => None, | |
year => Some(year.parse::<i32>()?), | |
}; | |
let death_year = match &record[3] { | |
"\\N" => None, | |
year => Some(year.parse::<i32>()?), | |
}; | |
// Create record in SurrealDB | |
let name = NameBasic { | |
nconst: record[0].to_string(), | |
primary_name: record[1].to_string(), | |
birth_year, | |
death_year, | |
primary_profession: record[4].to_string(), | |
known_for_titles: record[5].to_string(), | |
}; | |
// Insert using the nconst as the record ID | |
(&tx) | |
.create::<Option<NameBasic>>("name_basic") | |
.content(name) | |
.await?; | |
count += 1; | |
if count % 100_000 == 0 { | |
println!("Processed {} NameBasics", count); | |
} | |
} | |
println!("Starting database insert..."); | |
tx.commit().await?; | |
let duration = start.elapsed(); | |
println!("Successfully imported {} records in {:?}", count, duration); | |
Ok(()) | |
} |
@Dhghomon With this syntax how can I insert a few millions of rows of data in a transaction?
Hey @HuakunShen, while the client side transactions have not been implemented yet, you quickly create mock data like this in SurrealQL
CREATE |name_basics:3000000| CONTENT {
nconst: rand::string(10),
primary_name: rand::string(10),
birth_year: rand::int(1970, 2024),
death_year: rand::int(1970, 2024),
primary_profession: rand::string(10),
known_for_titles: rand::string(15),
};
To get more info on inserting data you can look at this lesson in the SurrealDB fundamentals course: https://surrealdb.com/learn/fundamentals/schemaless/inserting-data
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Interesting that there is technically a .transaction() method, I guess it's a placeholder for the time being because it is hidden in the docs and there is a note saying that it isn't supported yet. At the moment a manual .query("BEGIN").query("CREATE person").query("COMMIT") is the way to use a manual transaction.