Created
September 8, 2021 09:07
-
-
Save dalf/ef661c24da553b3db41bee6a8f24da9f to your computer and use it in GitHub Desktop.
fst import
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[package]
name = "rust_test"
version = "0.1.0"
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
fst = "0.4"
rusqlite = "0.25"
flate2 = "1.0.20"
tar = "0.4.37"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// src/main.rs | |
use std::{fs::File}; | |
use std::io::{BufRead, BufReader, BufWriter}; | |
use fst::{MapBuilder, SetBuilder}; | |
use rusqlite; | |
use flate2::read::GzDecoder; | |
use tar::Archive; | |
/// A single row of the green-domain query: one URL/host string.
///
/// NOTE(review): nothing in this file constructs this type; it looks like a
/// leftover from an earlier row-mapping approach — confirm before removing.
#[derive(Debug)]
struct GreenResult {
    /// The URL (host) value as read from the `url` column.
    url: String,
}
fn import_green() -> Result<(), Box<dyn std::error::Error>> { | |
let conn = rusqlite::Connection::open("green_urls_2021-02-16.db")?; | |
let mut hosts: Vec<String> = Vec::with_capacity(1_000_000); | |
let mut totalsize = 0; | |
let sql = " | |
SELECT | |
TRIM(url) as url | |
FROM greendomain | |
WHERE green=1 | |
GROUP BY TRIM(url) | |
ORDER BY TRIM(url) | |
"; | |
let mut stmt = conn.prepare(sql)?; | |
let rows = stmt.query([]); | |
while let Some(row) = rows.next()? { | |
let host: String = row.get(0); | |
let reversed_host = host.chars().rev().collect::<String>(); | |
totalsize += reversed_host.len() + 1; | |
hosts.push(reversed_host.trim().to_owned()); | |
} | |
unsafe { | |
println!("Entry count {}", totalsize); | |
} | |
hosts.sort(); | |
let file_handle = File::create("urls.fst")?; | |
let buffered_writer = BufWriter::new(file_handle); | |
let mut set_builder = SetBuilder::new(buffered_writer)?; | |
for host in hosts { | |
set_builder.insert(host).unwrap(); | |
} | |
set_builder.finish()?; | |
Ok(()) | |
} | |
/*
fn read_tgz(file_name: &str) -> Result<(), std::io::Error> {
    let tar_gz = File::open(file_name)?;
    let tar = GzDecoder::new(tar_gz);
    let mut archive = Archive::new(tar);
    let z = archive.entries()?;
    for e in z {
        let ee = e?;
        let path = ee.header().path()?;
        let file_name = path.file_name()?;
        if file_name.to_owned() == "smarter_encryption.txt" {
            break;
        }
    }
    Ok(())
}
*/
fn import_smarter_encryption() -> Result<(), Box<dyn std::error::Error>> { | |
unsafe { | |
println!("Read"); | |
} | |
let file = File::open("smarter_encryption_latest/smarter_encryption.txt")?; | |
let reader = BufReader::new(file); | |
let mut v: Vec<String> = Vec::with_capacity(34_755_302); | |
for line in reader.lines() { | |
let reversed_line = line?.chars().rev().collect::<String>(); | |
v.push(reversed_line.trim().to_owned()); | |
} | |
let v_len = v.len(); | |
unsafe { | |
println!("Sort {} hosts", v_len); | |
} | |
v.sort(); | |
unsafe { | |
println!("Write"); | |
} | |
let file_handle = File::create("smarter_encryption_latest/smarter_encryption.fst")?; | |
let buffered_writer = BufWriter::new(file_handle); | |
let mut set_builder = SetBuilder::new(buffered_writer)?; | |
let mut count: i64 = 0; | |
for value in v { | |
set_builder.insert(value).unwrap(); | |
count += 1; | |
if count % 500_000 == 0 { | |
unsafe { | |
println!("{}", count); | |
} | |
} | |
} | |
set_builder.finish()?; | |
Ok(()) | |
} | |
fn main() -> Result<(), Box<dyn std::error::Error>> { | |
import_green()?; | |
// import_smarter_encryption()?; | |
Ok(()) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment