Last active
January 2, 2024 10:13
-
-
Save PrivateRookie/3cc2330f9bbeb1115efb4a49f0627298 to your computer and use it in GitHub Desktop.
tsv 解析
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#![allow(dead_code)] | |
use std::net::IpAddr; | |
use criterion::{black_box, criterion_group, criterion_main, Criterion}; | |
const OUTPUT: &str = include_str!("net.tsv"); | |
fn criterion_benchmark(c: &mut Criterion) { | |
let re = regex::Regex::new(r"(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\d+)").unwrap(); | |
let mut r1 = Vec::with_capacity(400); | |
c.bench_function("regex_owned", |b| { | |
b.iter(|| { | |
// 重置输出 vector | |
r1.clear(); | |
regex_owned(black_box(OUTPUT), &re, &mut r1); | |
}) | |
}); | |
let mut r2 = Vec::with_capacity(400); | |
c.bench_function("regex_borrow", |b| { | |
b.iter(|| { | |
// 重置输出 vector | |
r2.clear(); | |
regex_borrow(black_box(OUTPUT), &re, &mut r2); | |
}) | |
}); | |
let re = regex::bytes::Regex::new(r"(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\d+)").unwrap(); | |
c.bench_function("regex_ascii", |b| { | |
b.iter(|| { | |
// 重置输出 vector | |
r2.clear(); | |
regex_ascii(black_box(OUTPUT), &re, &mut r2); | |
}) | |
}); | |
c.bench_function("split", |b| { | |
b.iter(|| { | |
// 重置输出 vector | |
r2.clear(); | |
split(black_box(OUTPUT), &mut r2); | |
}) | |
}); | |
c.bench_function("split_whitespace", |b| { | |
b.iter(|| { | |
// 重置输出 vector | |
r2.clear(); | |
split_whitespace(black_box(OUTPUT), &mut r2); | |
}) | |
}); | |
c.bench_function("split_ascii_whitespace", |b| { | |
b.iter(|| { | |
// 重置输出 vector | |
r2.clear(); | |
split_ascii_whitespace(black_box(OUTPUT), &mut r2); | |
}) | |
}); | |
let mut r3 = Vec::with_capacity(400); | |
c.bench_function("split_typed", |b| { | |
b.iter(|| { | |
// 重置输出 vector | |
r3.clear(); | |
split_typed(black_box(OUTPUT), &mut r3); | |
}) | |
}); | |
c.bench_function("split_reduce_alloc", |b| { | |
b.iter(|| { | |
// 重置输出 vector | |
r2.clear(); | |
split_reduce_alloc(black_box(OUTPUT), &mut r2); | |
}) | |
}); | |
c.bench_function("manual", |b| { | |
b.iter(|| { | |
// 重置输出 vector | |
r2.clear(); | |
manual(black_box(OUTPUT), &mut r2); | |
}) | |
}); | |
} | |
criterion_group!(benches, criterion_benchmark); | |
criterion_main!(benches); | |
/// States of the byte-level state machine used by `manual`.
///
/// A `*_C` state means the parser is currently consuming the characters of that
/// field; the state without the suffix means the field has been captured and the
/// parser is skipping the whitespace that follows it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
// The variant names deliberately mirror the field names in SCREAMING_SNAKE_CASE.
#[allow(non_camel_case_types)]
enum State {
    /// At the beginning of a line, before the first non-whitespace byte.
    START,
    /// Consuming the local address.
    L_ADDR_C,
    /// Local address captured; skipping whitespace before the local port.
    L_ADDR,
    /// Consuming the local port.
    L_PORT_C,
    /// Local port parsed; skipping whitespace before the remote address.
    L_PORT,
    /// Consuming the remote address.
    R_ADDR_C,
    /// Remote address captured; skipping whitespace before the remote port.
    R_ADDR,
    /// Consuming the remote port.
    R_PORT_C,
    /// Remote port parsed; skipping whitespace before the connection state.
    R_PORT,
    /// Consuming the connection state.
    STATE_C,
    /// Connection state captured; skipping whitespace before the pid.
    STATE,
    /// Consuming the pid.
    PID_C,
    /// Ignore everything up to the next newline (the line was either fully
    /// handled or could not be parsed).
    ABORT,
}
/// Coarse classification of a single input byte, as needed by the hand-written
/// parser: line break, in-line whitespace, or anything else.
///
/// The explicit discriminant values are kept exactly as they were so that any
/// `as` casts on this public enum keep producing the same numbers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BChar {
    /// A line feed (`\n`).
    NL = 0,
    /// In-line whitespace: tab, form feed, carriage return or space.
    WS = 3,
    /// Any other byte.
    Char = 2,
}

impl<'a> From<&'a u8> for BChar {
    /// Classifies `value`; only `\n` counts as a line break, so a `\r\n` pair is
    /// seen as whitespace followed by a line break.
    fn from(value: &'a u8) -> Self {
        match value {
            b'\n' => BChar::NL,
            b'\t' | b'\x0C' | b'\r' | b' ' => BChar::WS,
            _ => BChar::Char,
        }
    }
}
fn manual<'a>(content: &'a str, result: &mut Vec<Record<'a>>) { | |
use crate::State::*; | |
fn cast(data: &[u8], start: usize, end: usize) -> &str { | |
unsafe { std::mem::transmute(&data[start..end]) } | |
} | |
let data = content.as_bytes(); | |
let mut start = 0; | |
let mut local_addr = cast(data, 0, 0); | |
let mut local_port = 0; | |
let mut remote_addr = cast(data, 0, 0); | |
let mut remote_port = 0; | |
let mut c_state = cast(data, 0, 0); | |
let state = data | |
.iter() | |
.map(BChar::from) | |
.enumerate() | |
.fold(START, |state, (idx, b)| match (&state, b) { | |
(_, BChar::NL) => START, | |
(START, BChar::WS) => state, | |
(START, BChar::Char) => { | |
start = idx; | |
L_ADDR_C | |
} | |
(L_ADDR_C, BChar::WS) => { | |
local_addr = cast(data, start, idx); | |
L_ADDR | |
} | |
(L_ADDR_C, BChar::Char) => state, | |
(L_ADDR, BChar::WS) => state, | |
(L_ADDR, BChar::Char) => { | |
start = idx; | |
L_PORT_C | |
} | |
(L_PORT_C, BChar::WS) => cast(data, start, idx) | |
.parse() | |
.map(|port: u16| { | |
local_port = port; | |
L_PORT | |
}) | |
.unwrap_or(ABORT), | |
(L_PORT_C, BChar::Char) => state, | |
(L_PORT, BChar::WS) => state, | |
(L_PORT, BChar::Char) => { | |
start = idx; | |
R_ADDR_C | |
} | |
(R_ADDR_C, BChar::WS) => { | |
remote_addr = cast(data, start, idx); | |
R_ADDR | |
} | |
(R_ADDR_C, BChar::Char) => state, | |
(R_ADDR, BChar::WS) => state, | |
(R_ADDR, BChar::Char) => { | |
start = idx; | |
R_PORT_C | |
} | |
(R_PORT_C, BChar::WS) => cast(data, start, idx) | |
.parse() | |
.map(|port: u16| { | |
remote_port = port; | |
R_PORT | |
}) | |
.unwrap_or(ABORT), | |
(R_PORT_C, BChar::Char) => state, | |
(R_PORT, BChar::WS) => state, | |
(R_PORT, BChar::Char) => { | |
start = idx; | |
STATE_C | |
} | |
(STATE_C, BChar::WS) => { | |
c_state = cast(data, start, idx); | |
STATE | |
} | |
(STATE_C, BChar::Char) => state, | |
(STATE, BChar::WS) => state, | |
(STATE, BChar::Char) => { | |
start = idx; | |
PID_C | |
} | |
(PID_C, BChar::WS) => { | |
match cast(data, start, idx).parse() { | |
Ok(pid) => { | |
result.push(Record { | |
local_addr, | |
remote_addr, | |
state: c_state, | |
local_port, | |
remote_port, | |
pid, | |
}); | |
} | |
Err(_) => {} | |
} | |
ABORT | |
} | |
(PID_C, BChar::Char) => state, | |
(ABORT, BChar::WS | BChar::Char) => state, | |
}); | |
assert_eq!(state, START); | |
assert_eq!(result.len(), 377); | |
} | |
/// One parsed row whose addresses have been converted to [`IpAddr`] values.
///
/// Only `state` still borrows from the input text.
#[derive(Debug, Clone, PartialEq, Eq)]
struct RecordTyped<'a> {
    /// Local endpoint address.
    local_addr: IpAddr,
    /// Local endpoint port.
    local_port: u16,
    /// Remote endpoint address.
    remote_addr: IpAddr,
    /// Remote endpoint port.
    remote_port: u16,
    /// Connection state column, borrowed verbatim from the input.
    state: &'a str,
    /// Process id column.
    pid: u64,
}
fn split_reduce_alloc<'a>(input: &'a str, result: &mut Vec<Record<'a>>) { | |
input | |
.lines() | |
.filter_map(|line| { | |
let mut iter = line.split_ascii_whitespace(); | |
Some(Record { | |
local_addr: iter.next()?, | |
local_port: iter.next()?.parse().ok()?, | |
remote_addr: iter.next()?, | |
remote_port: iter.next()?.parse().ok()?, | |
state: iter.next()?, | |
pid: iter.next()?.parse().ok()?, | |
}) | |
}) | |
.for_each(|item| result.push(item)); | |
assert_eq!(result.len(), 377); | |
} | |
fn split_typed<'a>(input: &'a str, result: &mut Vec<RecordTyped<'a>>) { | |
input | |
.lines() | |
.filter_map(|line| { | |
let mut iter = line.split_ascii_whitespace(); | |
let local_addr = iter.next()?.parse().ok()?; | |
let local_port: u16 = iter.next()?.parse().ok()?; | |
let remote_addr = iter.next()?.parse().ok()?; | |
let remote_port: u16 = iter.next()?.parse().ok()?; | |
let state = iter.next()?; | |
let pid: u64 = iter.next()?.parse().ok()?; | |
Some(RecordTyped { | |
local_addr, | |
local_port, | |
remote_addr, | |
remote_port, | |
state, | |
pid, | |
}) | |
}) | |
.for_each(|item| result.push(item)); | |
assert_eq!(result.len(), 377); | |
} | |
fn split_ascii_whitespace<'a>(input: &'a str, result: &mut Vec<Record<'a>>) { | |
input | |
.lines() | |
.filter_map(|line| { | |
let mut iter = line.split_ascii_whitespace(); | |
let local_addr = iter.next()?; | |
let local_port: u16 = iter.next()?.parse().ok()?; | |
let remote_addr = iter.next()?; | |
let remote_port: u16 = iter.next()?.parse().ok()?; | |
let state = iter.next()?; | |
let pid: u64 = iter.next()?.parse().ok()?; | |
Some(Record { | |
local_addr, | |
local_port, | |
remote_addr, | |
remote_port, | |
state, | |
pid, | |
}) | |
}) | |
.for_each(|item| result.push(item)); | |
assert_eq!(result.len(), 377); | |
} | |
fn split_whitespace<'a>(input: &'a str, result: &mut Vec<Record<'a>>) { | |
input | |
.lines() | |
.filter_map(|line| { | |
let mut iter = line.split_whitespace(); | |
let local_addr = iter.next()?; | |
let local_port: u16 = iter.next()?.parse().ok()?; | |
let remote_addr = iter.next()?; | |
let remote_port: u16 = iter.next()?.parse().ok()?; | |
let state = iter.next()?; | |
let pid: u64 = iter.next()?.parse().ok()?; | |
Some(Record { | |
local_addr, | |
local_port, | |
remote_addr, | |
remote_port, | |
state, | |
pid, | |
}) | |
}) | |
.for_each(|item| result.push(item)); | |
assert_eq!(result.len(), 377); | |
} | |
fn split<'a>(input: &'a str, result: &mut Vec<Record<'a>>) { | |
input | |
.lines() | |
.filter_map(|line| { | |
let mut iter = line.split([' ', '\t', '\r']).filter(|c| !c.is_empty()); | |
let local_addr = iter.next()?; | |
let local_port: u16 = iter.next()?.parse().ok()?; | |
let remote_addr = iter.next()?; | |
let remote_port: u16 = iter.next()?.parse().ok()?; | |
let state = iter.next()?; | |
let pid: u64 = iter.next()?.parse().ok()?; | |
Some(Record { | |
local_addr, | |
local_port, | |
remote_addr, | |
remote_port, | |
state, | |
pid, | |
}) | |
}) | |
.for_each(|item| result.push(item)); | |
assert_eq!(result.len(), 377); | |
} | |
/// Reinterprets `data` as a `&str` without validating it in release builds.
///
/// The caller must pass bytes that are valid UTF-8; in this file that holds
/// because the bytes are sub-slices of a `&str`. Debug builds verify it.
fn cast(data: &[u8]) -> &str {
    debug_assert!(
        std::str::from_utf8(data).is_ok(),
        "cast() requires valid UTF-8"
    );
    // SAFETY: callers only pass slices taken from an existing `&str` and cut at
    // regex capture-group boundaries, so the bytes are valid UTF-8.
    // NOTE(review): this is a safe fn with an unchecked precondition; keep it
    // private to this file.
    unsafe { std::str::from_utf8_unchecked(data) }
}
fn regex_ascii<'a>(input: &'a str, re: ®ex::bytes::Regex, result: &mut Vec<Record<'a>>) { | |
input.lines().for_each(|line| { | |
if let Some(item) = re.captures(line.as_bytes()).and_then(|captures| { | |
let (_, [local_addr, local_port, remote_addr, remote_port, state, pid]) = | |
captures.extract(); | |
let ret = Record { | |
local_addr: cast(local_addr), | |
local_port: cast(local_port).parse().ok()?, | |
remote_addr: cast(remote_addr), | |
remote_port: cast(remote_port).parse().ok()?, | |
state: cast(state), | |
pid: cast(pid).parse().ok()?, | |
}; | |
Some(ret) | |
}) { | |
result.push(item); | |
} | |
}); | |
assert_eq!(result.len(), 377); | |
} | |
/// One parsed row whose textual columns borrow from the input text.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Record<'a> {
    /// Local endpoint address, exactly as written in the input.
    local_addr: &'a str,
    /// Local endpoint port.
    local_port: u16,
    /// Remote endpoint address, exactly as written in the input.
    remote_addr: &'a str,
    /// Remote endpoint port.
    remote_port: u16,
    /// Connection state column, borrowed verbatim from the input.
    state: &'a str,
    /// Process id column.
    pid: u64,
}
fn regex_borrow<'a>(input: &'a str, re: ®ex::Regex, result: &mut Vec<Record<'a>>) { | |
input.lines().for_each(|line| { | |
if let Some(item) = re.captures(line).and_then(|captures| { | |
let (_, [local_addr, local_port, remote_addr, remote_port, state, pid]) = | |
captures.extract(); | |
let ret = Record { | |
local_addr, | |
local_port: local_port.parse().ok()?, | |
remote_addr, | |
remote_port: remote_port.parse().ok()?, | |
state, | |
pid: pid.parse().ok()?, | |
}; | |
Some(ret) | |
}) { | |
result.push(item); | |
} | |
}); | |
assert_eq!(result.len(), 377); | |
} | |
/// One parsed row that owns its textual columns, so it does not borrow from the
/// input text.
#[derive(Debug, Clone, PartialEq, Eq)]
struct OwnedRecord {
    /// Local endpoint address, copied from the input.
    local_addr: String,
    /// Local endpoint port.
    local_port: u16,
    /// Remote endpoint address, copied from the input.
    remote_addr: String,
    /// Remote endpoint port.
    remote_port: u16,
    /// Connection state column, copied from the input.
    state: String,
    /// Process id column.
    pid: u64,
}
fn regex_owned(input: &str, re: ®ex::Regex, result: &mut Vec<OwnedRecord>) { | |
input.lines().for_each(|line| { | |
if let Some(item) = re.captures(line).and_then(|captures| { | |
let (_, [local_addr, local_port, remote_addr, remote_port, state, pid]) = | |
captures.extract(); | |
let ret = OwnedRecord { | |
local_addr: local_addr.to_string(), | |
local_port: local_port.parse().ok()?, | |
remote_addr: remote_addr.to_string(), | |
remote_port: remote_port.parse().ok()?, | |
state: state.to_string(), | |
pid: pid.parse().ok()?, | |
}; | |
Some(ret) | |
}) { | |
result.push(item); | |
} | |
}); | |
assert_eq!(result.len(), 377); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment