Skip to content

Instantly share code, notes, and snippets.

@PrivateRookie
Last active January 2, 2024 10:13
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save PrivateRookie/3cc2330f9bbeb1115efb4a49f0627298 to your computer and use it in GitHub Desktop.
Save PrivateRookie/3cc2330f9bbeb1115efb4a49f0627298 to your computer and use it in GitHub Desktop.
tsv 解析
#![allow(dead_code)]
use std::net::IpAddr;
use criterion::{black_box, criterion_group, criterion_main, Criterion};
const OUTPUT: &str = include_str!("net.tsv");
/// Registers one Criterion benchmark per parsing strategy, each fed the embedded `OUTPUT` text.
///
/// The output vectors are created once (capacity 400) outside the timed closures and are
/// cleared at the start of every iteration, so each run starts from an empty vector.
fn criterion_benchmark(c: &mut Criterion) {
    // Six whitespace-separated columns; shared by the `str` and the `bytes` regex.
    const PATTERN: &str = r"(\S+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\S+)\s+(\d+)";

    let text_re = regex::Regex::new(PATTERN).unwrap();

    let mut owned_rows = Vec::with_capacity(400);
    c.bench_function("regex_owned", |bencher| {
        bencher.iter(|| {
            // Reset the output vector.
            owned_rows.clear();
            regex_owned(black_box(OUTPUT), &text_re, &mut owned_rows);
        })
    });

    // Every benchmark below that produces borrowed `Record`s reuses this vector.
    let mut borrowed_rows = Vec::with_capacity(400);
    c.bench_function("regex_borrow", |bencher| {
        bencher.iter(|| {
            // Reset the output vector.
            borrowed_rows.clear();
            regex_borrow(black_box(OUTPUT), &text_re, &mut borrowed_rows);
        })
    });

    let bytes_re = regex::bytes::Regex::new(PATTERN).unwrap();
    c.bench_function("regex_ascii", |bencher| {
        bencher.iter(|| {
            // Reset the output vector.
            borrowed_rows.clear();
            regex_ascii(black_box(OUTPUT), &bytes_re, &mut borrowed_rows);
        })
    });

    c.bench_function("split", |bencher| {
        bencher.iter(|| {
            // Reset the output vector.
            borrowed_rows.clear();
            split(black_box(OUTPUT), &mut borrowed_rows);
        })
    });

    c.bench_function("split_whitespace", |bencher| {
        bencher.iter(|| {
            // Reset the output vector.
            borrowed_rows.clear();
            split_whitespace(black_box(OUTPUT), &mut borrowed_rows);
        })
    });

    c.bench_function("split_ascii_whitespace", |bencher| {
        bencher.iter(|| {
            // Reset the output vector.
            borrowed_rows.clear();
            split_ascii_whitespace(black_box(OUTPUT), &mut borrowed_rows);
        })
    });

    let mut typed_rows = Vec::with_capacity(400);
    c.bench_function("split_typed", |bencher| {
        bencher.iter(|| {
            // Reset the output vector.
            typed_rows.clear();
            split_typed(black_box(OUTPUT), &mut typed_rows);
        })
    });

    c.bench_function("split_reduce_alloc", |bencher| {
        bencher.iter(|| {
            // Reset the output vector.
            borrowed_rows.clear();
            split_reduce_alloc(black_box(OUTPUT), &mut borrowed_rows);
        })
    });

    c.bench_function("manual", |bencher| {
        bencher.iter(|| {
            // Reset the output vector.
            borrowed_rows.clear();
            manual(black_box(OUTPUT), &mut borrowed_rows);
        })
    });
}
// Criterion entry points: declare a benchmark group named `benches` that contains
// `criterion_benchmark`, then hand that group to `criterion_main!`.
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
/// States of the hand-written, byte-driven line parser implemented in `manual`.
///
/// A `_C` suffix marks a state in which the characters of a field are being consumed; the
/// same name without the suffix means that field has been read and the whitespace after it
/// is being skipped. A newline always sends the parser back to `START`.
#[derive(Debug, PartialEq, Eq)]
#[allow(non_camel_case_types)]
enum State {
/// Beginning of a line: no field character has been seen yet.
START,
/// Consuming the local-address field (first column).
L_ADDR_C,
/// Local address captured; skipping whitespace before the local port.
L_ADDR,
/// Consuming the local-port field (second column).
L_PORT_C,
/// Local port parsed; skipping whitespace before the remote address.
L_PORT,
/// Consuming the remote-address field (third column).
R_ADDR_C,
/// Remote address captured; skipping whitespace before the remote port.
R_ADDR,
/// Consuming the remote-port field (fourth column).
R_PORT_C,
/// Remote port parsed; skipping whitespace before the connection state.
R_PORT,
/// Consuming the connection-state field (fifth column).
STATE_C,
/// Connection state captured; skipping whitespace before the PID.
STATE,
/// Consuming the PID field (sixth column); `manual` emits the record when this field ends.
PID_C,
/// Ignoring the remainder of the current line. Entered when a port fails to parse and
/// after the PID field has been handled.
ABORT,
}
/// Coarse classification of a single input byte, as consumed by the hand-written parser.
///
/// The explicit discriminants are kept exactly as they were so that `as` casts of the
/// variants keep their values.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BChar {
    /// Line feed (`\n`): terminates a line.
    NL = 0,
    /// Field separator: tab, form feed, carriage return or space.
    WS = 3,
    /// Any other byte: part of a field.
    Char = 2,
}

impl<'a> From<&'a u8> for BChar {
    /// Classifies one byte as newline, separator or field content.
    ///
    /// Together `NL` and `WS` cover the same bytes as `u8::is_ascii_whitespace`; the vertical
    /// tab (`0x0B`) is therefore treated as field content.
    fn from(value: &'a u8) -> Self {
        match value {
            b'\n' => BChar::NL,
            b'\t' | b'\x0C' | b'\r' | b' ' => BChar::WS,
            _ => BChar::Char,
        }
    }
}
/// Parses connection records out of `content` with a hand-written, byte-driven state machine
/// and appends them to `result`.
///
/// Every line is expected to hold six whitespace-separated fields: local address, local
/// port, remote address, remote port, connection state and PID. Anything after the PID is
/// ignored. A line whose ports or PID do not parse as integers is skipped.
///
/// The PID field is terminated by whitespace, by a newline, or by the end of the input
/// (which is treated as an implicit newline), so a record is kept even when nothing follows
/// its PID.
///
/// # Panics
///
/// Panics if `result` does not hold exactly 377 records afterwards — presumably the number
/// of records in the benchmark fixture.
fn manual<'a>(content: &'a str, result: &mut Vec<Record<'a>>) {
    use crate::State::*;

    /// Borrows `data[start..end]` as a `&str` without re-validating it.
    fn cast(data: &[u8], start: usize, end: usize) -> &str {
        // SAFETY: `data` is the byte view of a `&str`. Every range passed in is either empty
        // or starts at offset 0 / right after an ASCII whitespace byte and ends at an ASCII
        // whitespace byte or at the end of the input. ASCII bytes never occur inside a
        // multi-byte UTF-8 sequence, so both ends are character boundaries and the slice is
        // valid UTF-8.
        unsafe { std::str::from_utf8_unchecked(&data[start..end]) }
    }

    let data = content.as_bytes();
    // Offset of the first byte of the field currently being consumed.
    let mut start = 0;
    // Fields of the record under construction; overwritten as each line is scanned.
    let mut local_addr = cast(data, 0, 0);
    let mut local_port = 0;
    let mut remote_addr = cast(data, 0, 0);
    let mut remote_port = 0;
    let mut c_state = cast(data, 0, 0);
    let state = data
        .iter()
        .map(BChar::from)
        // Treat the end of the input as a line terminator so that a final line without a
        // trailing `\n` is still flushed; the synthetic newline gets index `data.len()`.
        .chain(std::iter::once(BChar::NL))
        .enumerate()
        .fold(START, |state, (idx, b)| match (&state, b) {
            // The PID field ends at whitespace or at the end of the line. This arm has to
            // come before the generic newline arm, otherwise a PID directly followed by
            // `\n` would drop the whole record.
            (PID_C, terminator @ (BChar::WS | BChar::NL)) => {
                if let Ok(pid) = cast(data, start, idx).parse() {
                    result.push(Record {
                        local_addr,
                        remote_addr,
                        state: c_state,
                        local_port,
                        remote_port,
                        pid,
                    });
                }
                if matches!(terminator, BChar::NL) {
                    START
                } else {
                    // Ignore whatever else is on this line.
                    ABORT
                }
            }
            (_, BChar::NL) => START,
            (START, BChar::WS) => state,
            (START, BChar::Char) => {
                start = idx;
                L_ADDR_C
            }
            (L_ADDR_C, BChar::WS) => {
                local_addr = cast(data, start, idx);
                L_ADDR
            }
            (L_ADDR_C, BChar::Char) => state,
            (L_ADDR, BChar::WS) => state,
            (L_ADDR, BChar::Char) => {
                start = idx;
                L_PORT_C
            }
            // A port that is not a valid `u16` abandons the line.
            (L_PORT_C, BChar::WS) => cast(data, start, idx)
                .parse()
                .map(|port: u16| {
                    local_port = port;
                    L_PORT
                })
                .unwrap_or(ABORT),
            (L_PORT_C, BChar::Char) => state,
            (L_PORT, BChar::WS) => state,
            (L_PORT, BChar::Char) => {
                start = idx;
                R_ADDR_C
            }
            (R_ADDR_C, BChar::WS) => {
                remote_addr = cast(data, start, idx);
                R_ADDR
            }
            (R_ADDR_C, BChar::Char) => state,
            (R_ADDR, BChar::WS) => state,
            (R_ADDR, BChar::Char) => {
                start = idx;
                R_PORT_C
            }
            (R_PORT_C, BChar::WS) => cast(data, start, idx)
                .parse()
                .map(|port: u16| {
                    remote_port = port;
                    R_PORT
                })
                .unwrap_or(ABORT),
            (R_PORT_C, BChar::Char) => state,
            (R_PORT, BChar::WS) => state,
            (R_PORT, BChar::Char) => {
                start = idx;
                STATE_C
            }
            (STATE_C, BChar::WS) => {
                c_state = cast(data, start, idx);
                STATE
            }
            (STATE_C, BChar::Char) => state,
            (STATE, BChar::WS) => state,
            (STATE, BChar::Char) => {
                start = idx;
                PID_C
            }
            (PID_C, BChar::Char) => state,
            (ABORT, BChar::WS | BChar::Char) => state,
        });
    // The scan always ends on the synthetic newline, so the machine must be back at `START`.
    assert_eq!(state, START);
    assert_eq!(result.len(), 377);
}
/// One parsed connection row with both addresses converted to `IpAddr`; only the connection
/// state still borrows from the input text. Produced by `split_typed`.
struct RecordTyped<'a> {
// Local address (first column), parsed.
local_addr: IpAddr,
// Local port (second column).
local_port: u16,
// Remote address (third column), parsed.
remote_addr: IpAddr,
// Remote port (fourth column).
remote_port: u16,
// Connection state (fifth column), as raw text.
state: &'a str,
// Process id (sixth column).
pid: u64,
}
/// Parses `input` line by line, splitting on ASCII whitespace, and appends one `Record` per
/// well-formed line to `result`. Each field is read straight into the struct literal.
///
/// Lines with fewer than six columns, or with a port/PID that does not parse, are skipped.
///
/// # Panics
///
/// Panics if `result` does not hold exactly 377 records afterwards.
fn split_reduce_alloc<'a>(input: &'a str, result: &mut Vec<Record<'a>>) {
    let records = input.lines().filter_map(|row| {
        let mut columns = row.split_ascii_whitespace();
        let record = Record {
            local_addr: columns.next()?,
            local_port: columns.next()?.parse().ok()?,
            remote_addr: columns.next()?,
            remote_port: columns.next()?.parse().ok()?,
            state: columns.next()?,
            pid: columns.next()?.parse().ok()?,
        };
        Some(record)
    });
    result.extend(records);
    assert_eq!(result.len(), 377);
}
/// Parses `input` line by line, splitting on ASCII whitespace, and appends one `RecordTyped`
/// per well-formed line to `result`. Both address columns are parsed as well as the numeric
/// columns.
///
/// Lines with fewer than six columns, or with any column that fails to parse, are skipped.
///
/// # Panics
///
/// Panics if `result` does not hold exactly 377 records afterwards.
fn split_typed<'a>(input: &'a str, result: &mut Vec<RecordTyped<'a>>) {
    /// Converts one line into a record; `None` when a column is missing or malformed.
    fn parse_row<'r>(row: &'r str) -> Option<RecordTyped<'r>> {
        let mut columns = row.split_ascii_whitespace();
        let local_addr = columns.next()?.parse().ok()?;
        let local_port = columns.next()?.parse().ok()?;
        let remote_addr = columns.next()?.parse().ok()?;
        let remote_port = columns.next()?.parse().ok()?;
        let state = columns.next()?;
        let pid = columns.next()?.parse().ok()?;
        Some(RecordTyped {
            local_addr,
            local_port,
            remote_addr,
            remote_port,
            state,
            pid,
        })
    }

    for row in input.lines() {
        if let Some(record) = parse_row(row) {
            result.push(record);
        }
    }
    assert_eq!(result.len(), 377);
}
/// Parses `input` line by line with `str::split_ascii_whitespace` and appends one `Record`
/// per well-formed line to `result`.
///
/// Lines with fewer than six columns, or with a port/PID that does not parse, are skipped.
///
/// # Panics
///
/// Panics if `result` does not hold exactly 377 records afterwards.
fn split_ascii_whitespace<'a>(input: &'a str, result: &mut Vec<Record<'a>>) {
    for row in input.lines() {
        let mut tokens = row.split_ascii_whitespace();
        // The closure exists only so that `?` can bail out of a malformed row.
        let mut parse_row = || -> Option<Record<'a>> {
            let local_addr = tokens.next()?;
            let local_port: u16 = tokens.next()?.parse().ok()?;
            let remote_addr = tokens.next()?;
            let remote_port: u16 = tokens.next()?.parse().ok()?;
            let state = tokens.next()?;
            let pid: u64 = tokens.next()?.parse().ok()?;
            Some(Record {
                local_addr,
                local_port,
                remote_addr,
                remote_port,
                state,
                pid,
            })
        };
        if let Some(record) = parse_row() {
            result.push(record);
        }
    }
    assert_eq!(result.len(), 377);
}
/// Parses `input` line by line with `str::split_whitespace` (Unicode-aware) and appends one
/// `Record` per well-formed line to `result`.
///
/// Lines with fewer than six columns, or with a port/PID that does not parse, are skipped.
///
/// # Panics
///
/// Panics if `result` does not hold exactly 377 records afterwards.
fn split_whitespace<'a>(input: &'a str, result: &mut Vec<Record<'a>>) {
    /// Converts one line into a record; `None` when a column is missing or malformed.
    fn parse_row<'r>(row: &'r str) -> Option<Record<'r>> {
        let mut words = row.split_whitespace();
        let local_addr = words.next()?;
        let local_port = words.next()?.parse::<u16>().ok()?;
        let remote_addr = words.next()?;
        let remote_port = words.next()?.parse::<u16>().ok()?;
        let state = words.next()?;
        let pid = words.next()?.parse::<u64>().ok()?;
        Some(Record {
            local_addr,
            local_port,
            remote_addr,
            remote_port,
            state,
            pid,
        })
    }

    result.extend(input.lines().filter_map(parse_row));
    assert_eq!(result.len(), 377);
}
/// Parses `input` line by line, splitting each line on space, tab and carriage return (empty
/// pieces between adjacent separators are dropped), and appends one `Record` per well-formed
/// line to `result`.
///
/// Lines with fewer than six columns, or with a port/PID that does not parse, are skipped.
///
/// # Panics
///
/// Panics if `result` does not hold exactly 377 records afterwards.
fn split<'a>(input: &'a str, result: &mut Vec<Record<'a>>) {
    let records = input.lines().filter_map(|row| {
        let mut pieces = row
            .split([' ', '\t', '\r'])
            .filter(|piece| !piece.is_empty());
        let local_addr = pieces.next()?;
        let local_port = pieces.next()?.parse::<u16>().ok()?;
        let remote_addr = pieces.next()?;
        let remote_port = pieces.next()?.parse::<u16>().ok()?;
        let state = pieces.next()?;
        let pid = pieces.next()?.parse::<u64>().ok()?;
        Some(Record {
            local_addr,
            local_port,
            remote_addr,
            remote_port,
            state,
            pid,
        })
    });
    for record in records {
        result.push(record);
    }
    assert_eq!(result.len(), 377);
}
/// Reinterprets `data` as a `&str` without paying for UTF-8 validation in release builds.
///
/// Callers must pass only valid UTF-8, e.g. sub-slices of a `&str` that start and end on
/// character boundaries. Debug builds check this and panic on a violation; release builds
/// skip the check, and passing invalid UTF-8 there is undefined behaviour.
fn cast(data: &[u8]) -> &str {
    debug_assert!(
        std::str::from_utf8(data).is_ok(),
        "cast() requires valid UTF-8 input"
    );
    // SAFETY: per the contract above, `data` is valid UTF-8. `from_utf8_unchecked` is the
    // standard-library conversion for exactly this case, so no raw `transmute` is needed.
    unsafe { std::str::from_utf8_unchecked(data) }
}
/// Parses `input` line by line with a byte-oriented regex and appends one `Record` per
/// matching line to `result`. Captured byte slices are turned back into `&str` via `cast`.
///
/// Lines the regex does not match, or whose port/PID captures do not parse, are skipped.
///
/// # Panics
///
/// Panics if `result` does not hold exactly 377 records afterwards.
fn regex_ascii<'a>(input: &'a str, re: &regex::bytes::Regex, result: &mut Vec<Record<'a>>) {
    for row in input.lines() {
        let parsed = re.captures(row.as_bytes()).and_then(|caps| {
            let (_, [l_addr, l_port, r_addr, r_port, conn_state, pid]) = caps.extract();
            Some(Record {
                local_addr: cast(l_addr),
                local_port: cast(l_port).parse().ok()?,
                remote_addr: cast(r_addr),
                remote_port: cast(r_port).parse().ok()?,
                state: cast(conn_state),
                pid: cast(pid).parse().ok()?,
            })
        });
        // An `Option` iterates over zero or one item, so this pushes only on success.
        result.extend(parsed);
    }
    assert_eq!(result.len(), 377);
}
/// One parsed connection row whose text fields borrow from the input string; only the ports
/// and the PID are converted to integers.
struct Record<'a> {
// Local address (first column), as raw text.
local_addr: &'a str,
// Local port (second column).
local_port: u16,
// Remote address (third column), as raw text.
remote_addr: &'a str,
// Remote port (fourth column).
remote_port: u16,
// Connection state (fifth column), as raw text.
state: &'a str,
// Process id (sixth column).
pid: u64,
}
/// Parses `input` line by line with a `str` regex and appends one `Record` per matching line
/// to `result`. Text fields borrow from `input`; nothing is copied.
///
/// Lines the regex does not match, or whose port/PID captures do not parse, are skipped.
///
/// # Panics
///
/// Panics if `result` does not hold exactly 377 records afterwards.
fn regex_borrow<'a>(input: &'a str, re: &regex::Regex, result: &mut Vec<Record<'a>>) {
    for row in input.lines() {
        if let Some(caps) = re.captures(row) {
            let (_, [l_addr, l_port, r_addr, r_port, conn_state, pid]) = caps.extract();
            // The closure exists only so that `?` can bail out on a number that fails to parse.
            let build = || -> Option<Record<'a>> {
                Some(Record {
                    local_addr: l_addr,
                    local_port: l_port.parse().ok()?,
                    remote_addr: r_addr,
                    remote_port: r_port.parse().ok()?,
                    state: conn_state,
                    pid: pid.parse().ok()?,
                })
            };
            if let Some(record) = build() {
                result.push(record);
            }
        }
    }
    assert_eq!(result.len(), 377);
}
/// One parsed connection row that owns its text fields as `String`s. Produced by
/// `regex_owned`.
struct OwnedRecord {
// Local address (first capture), copied into an owned string.
local_addr: String,
// Local port (second capture).
local_port: u16,
// Remote address (third capture), copied into an owned string.
remote_addr: String,
// Remote port (fourth capture).
remote_port: u16,
// Connection state (fifth capture), copied into an owned string.
state: String,
// Process id (sixth capture).
pid: u64,
}
/// Parses `input` line by line with a `str` regex and appends one `OwnedRecord` per matching
/// line to `result`. Every text field is copied into a fresh `String`.
///
/// Lines the regex does not match, or whose port/PID captures do not parse, are skipped.
///
/// # Panics
///
/// Panics if `result` does not hold exactly 377 records afterwards.
fn regex_owned(input: &str, re: &regex::Regex, result: &mut Vec<OwnedRecord>) {
    let records = input.lines().filter_map(|row| {
        let caps = re.captures(row)?;
        let (_, [l_addr, l_port, r_addr, r_port, conn_state, pid]) = caps.extract();
        Some(OwnedRecord {
            local_addr: String::from(l_addr),
            local_port: l_port.parse().ok()?,
            remote_addr: String::from(r_addr),
            remote_port: r_port.parse().ok()?,
            state: String::from(conn_state),
            pid: pid.parse().ok()?,
        })
    });
    result.extend(records);
    assert_eq!(result.len(), 377);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment