Skip to content

Instantly share code, notes, and snippets.

@dalf
Created September 8, 2021 09:07
Show Gist options
  • Save dalf/ef661c24da553b3db41bee6a8f24da9f to your computer and use it in GitHub Desktop.
Save dalf/ef661c24da553b3db41bee6a8f24da9f to your computer and use it in GitHub Desktop.
fst import
[package]
name = "rust_test"
version = "0.1.0"
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
fst = "0.4"
rusqlite = "0.25"
flate2 = "1.0.20"
tar = "0.4.37"
// src/main.rs
use std::{fs::File};
use std::io::{BufRead, BufReader, BufWriter};
use fst::{MapBuilder, SetBuilder};
use rusqlite;
use flate2::read::GzDecoder;
use tar::Archive;
/// A single row of the green-domain query: one URL/host string.
///
/// NOTE(review): nothing in the visible part of this file constructs or
/// reads this type — confirm whether it is still needed.
#[derive(Debug)]
struct GreenResult {
    /// The URL/host value as text.
    url: String,
}
/// Builds the `urls.fst` set from the green-domain SQLite database.
///
/// Reads every distinct, trimmed `url` with `green=1` from the `greendomain`
/// table of `green_urls_2021-02-16.db`, reverses each host character by
/// character, sorts the reversed hosts in byte order and writes them as an
/// FST set to `urls.fst`.
///
/// Prints the number of hosts collected and the accumulated key size
/// (bytes of every stored key plus one per key) before writing.
///
/// # Errors
///
/// Returns an error if the database cannot be opened or queried, if a row
/// cannot be decoded as text, if `urls.fst` cannot be created or written,
/// or if the FST builder rejects a key.
fn import_green() -> Result<(), Box<dyn std::error::Error>> {
    let conn = rusqlite::Connection::open("green_urls_2021-02-16.db")?;
    let mut hosts: Vec<String> = Vec::with_capacity(1_000_000);
    let mut total_size: usize = 0;
    let sql = "
SELECT
TRIM(url) as url
FROM greendomain
WHERE green=1
GROUP BY TRIM(url)
ORDER BY TRIM(url)
";
    let mut stmt = conn.prepare(sql)?;
    // `query` is fallible and `Rows::next` needs `&mut self`, so the result
    // must be unwrapped with `?` and bound mutably.
    let mut rows = stmt.query([])?;
    while let Some(row) = rows.next()? {
        let host: String = row.get(0)?;
        // Trimming before reversing yields the same string as trimming after,
        // without building a throw-away untrimmed copy.
        let reversed_host: String = host.trim().chars().rev().collect();
        total_size += reversed_host.len() + 1;
        hosts.push(reversed_host);
    }
    println!("Entry count {}", hosts.len());
    println!("Total size {}", total_size);

    // The SQL ORDER BY sorts the *unreversed* hosts; the FST builder needs the
    // reversed keys in lexicographic byte order, so sort again here.
    hosts.sort_unstable();

    let file_handle = File::create("urls.fst")?;
    let buffered_writer = BufWriter::new(file_handle);
    let mut set_builder = SetBuilder::new(buffered_writer)?;
    for host in hosts {
        // Propagate builder errors instead of panicking.
        set_builder.insert(host)?;
    }
    set_builder.finish()?;
    Ok(())
}
/*
fn read_tgz(file_name: &str) -> Result<(), std::io::Error> {
let tar_gz = File::open(file_name)?;
let tar = GzDecoder::new(tar_gz);
let mut archive = Archive::new(tar);
let z = archive.entries()?;
for e in z {
let ee = e?;
let path = ee.header().path()?;
let file_name = path.file_name()?;
if file_name.to_owned() == "smarter_encryption.txt" {
break;
}
}
Ok(())
}
*/
/// Builds `smarter_encryption_latest/smarter_encryption.fst` from the
/// plain-text host list `smarter_encryption_latest/smarter_encryption.txt`.
///
/// Each input line is trimmed and reversed character by character; the
/// reversed hosts are sorted in byte order and written as an FST set.
/// Progress is printed to stdout: a message per phase and the running key
/// count every 500 000 inserted keys.
///
/// # Errors
///
/// Returns an error if the input file cannot be opened or read, if the
/// output file cannot be created or written, or if the FST builder rejects
/// a key.
fn import_smarter_encryption() -> Result<(), Box<dyn std::error::Error>> {
    println!("Read");
    let file = File::open("smarter_encryption_latest/smarter_encryption.txt")?;
    let reader = BufReader::new(file);
    // Pre-sized for the line count of the data set this was written against,
    // to avoid repeated reallocation while loading.
    let mut v: Vec<String> = Vec::with_capacity(34_755_302);
    for line in reader.lines() {
        let line = line?;
        // Trim first, then reverse: same result as reverse-then-trim with one
        // allocation fewer per line.
        v.push(line.trim().chars().rev().collect());
    }

    println!("Sort {} hosts", v.len());
    // The FST builder requires keys in lexicographic byte order.
    v.sort_unstable();

    println!("Write");
    let file_handle = File::create("smarter_encryption_latest/smarter_encryption.fst")?;
    let buffered_writer = BufWriter::new(file_handle);
    let mut set_builder = SetBuilder::new(buffered_writer)?;
    let mut count: u64 = 0;
    for value in v {
        // Propagate builder errors instead of panicking.
        set_builder.insert(value)?;
        count += 1;
        if count % 500_000 == 0 {
            println!("{}", count);
        }
    }
    set_builder.finish()?;
    Ok(())
}
/// Program entry point: runs the green-domain import and returns its result.
///
/// The smarter-encryption import is currently disabled.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // import_smarter_encryption()?;
    import_green()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment