Skip to content

Instantly share code, notes, and snippets.

@RealNeGate
Last active November 27, 2023 11:41
Show Gist options
  • Save RealNeGate/a7b2b81241d4057d3f4ad9d20a633ecb to your computer and use it in GitHub Desktop.
Save RealNeGate/a7b2b81241d4057d3f4ad9d20a633ecb to your computer and use it in GitHub Desktop.
import java.nio.file.*;
import java.io.*;
import java.util.*;
import java.nio.charset.StandardCharsets;
/**
 * Minimal CSV indexer plus a small throughput benchmark.
 *
 * <p>The parser does not copy field contents. It records byte offsets into the
 * original buffer and decodes a field as UTF-8 only when {@code entry} is called.
 */
class csv {
    /**
     * Growable array of primitive {@code int}s. Used instead of a boxed
     * collection so that elements are stored unboxed.
     */
    static class IntArray {
        /** Number of valid elements; only indices {@code [0, cnt)} are meaningful. */
        int cnt;
        /** Backing storage. Slots at index {@code cnt} and above may hold stale values. */
        int[] data;

        /**
         * @param init_cap initial capacity of the backing array; may be 0
         */
        IntArray(int init_cap) {
            data = new int[init_cap];
            cnt = 0;
        }

        /** Forgets all elements but keeps the backing array for reuse. */
        void reset() {
            cnt = 0;
        }

        /**
         * Returns the element at index {@code i}.
         *
         * @throws IndexOutOfBoundsException if {@code i} is not in {@code [0, cnt)}
         */
        int get(int i) {
            // Check against cnt rather than data.length so that stale slots left
            // behind by reset() cannot be read by accident.
            Objects.checkIndex(i, cnt);
            return data[i];
        }

        /**
         * Appends {@code v}, doubling the backing array when it is full.
         *
         * @throws IllegalStateException if the array cannot grow any further
         */
        void put(int v) {
            if (cnt == data.length) {
                // Compute in long so doubling cannot overflow, and grow to at
                // least one slot so a zero-capacity array can still accept data.
                long doubled = Math.max(1L, 2L * cnt);
                int new_cap = (int) Math.min(doubled, (long) Integer.MAX_VALUE);
                if (new_cap <= cnt) {
                    throw new IllegalStateException("IntArray cannot grow beyond " + cnt + " elements");
                }
                data = Arrays.copyOf(data, new_cap);
            }
            data[cnt++] = v;
        }
    }

    /**
     * Index over a CSV byte buffer.
     *
     * <p>Fields are separated by {@code ','}. Rows are separated by {@code '\r'},
     * {@code '\n'}, or either two-byte combination of the two. Commas and line
     * breaks between a pair of {@code '"'} bytes do not separate anything.
     * Quotes are kept in the field text as-is; nothing is unescaped.
     */
    static class CSV {
        /**
         * For each row, the index into {@link #parsed} of that row's first field.
         * After {@link #parse} there is one extra trailing element equal to the
         * total number of fields, so row {@code r} owns the half-open range
         * {@code [rows[r], rows[r + 1])} of {@link #parsed}.
         */
        IntArray rows;
        /**
         * Byte offset in {@link #data} at which each field starts, in file order.
         * After {@link #parse} there is one extra trailing element equal to
         * {@code data.length}, so every field has a "next start" to measure against.
         */
        IntArray parsed;
        /** The raw file bytes most recently passed to {@link #parse}. */
        byte[] data;

        CSV() {
            rows = new IntArray(2*1024*1024);
            parsed = new IntArray(2*1024*1024);
        }

        /**
         * Indexes {@code data}, replacing any previous index. The buffer is kept
         * by reference and is not copied.
         *
         * <p>A quote that is never closed makes the rest of the buffer part of
         * the current field.
         *
         * @param data the CSV bytes to index
         * @throws NullPointerException if {@code data} is null
         */
        void parse(byte data[]) {
            Objects.requireNonNull(data, "data");
            this.data = data;
            rows.reset();
            parsed.reset();

            int i = 0;
            // One iteration per row.
            while (i < data.length) {
                rows.put(parsed.cnt);
                parsed.put(i);
                do {
                    byte c = data[i++];
                    if (is_line_break(c)) {
                        // Consume a two-byte break (CR LF or LF CR) as one.
                        if (i < data.length && is_line_break(data[i]) && data[i] != c) {
                            i += 1;
                        }
                        break;
                    } else if (c == ',') {
                        // The next field starts right after the comma.
                        parsed.put(i);
                    } else if (c == '"') {
                        // Skip to just past the closing quote, stopping at the
                        // end of the buffer if the quote is never closed.
                        while (i < data.length && data[i++] != '"') {
                            // nothing to do: the condition advances i
                        }
                    }
                } while (i < data.length);
            }

            // Trailing sentinels, see the field documentation.
            rows.put(parsed.cnt);
            parsed.put(data.length);
        }

        /** Returns the number of rows found by the last {@link #parse} (0 before any parse). */
        int row_count() {
            return Math.max(0, rows.cnt - 1);
        }

        /**
         * Returns the number of fields in row {@code i}.
         *
         * @throws IndexOutOfBoundsException if {@code i} is not a valid row index
         */
        int row_len(int i) {
            return rows.get(i + 1) - rows.get(i);
        }

        /**
         * Returns field {@code j} of row {@code i}, decoded as UTF-8. The field
         * separator or the row's line break is not included in the result.
         *
         * @throws IndexOutOfBoundsException if {@code i} is not a valid row index
         *         or {@code j} is not in {@code [0, row_len(i))}
         */
        String entry(int i, int j) {
            Objects.checkIndex(j, row_len(i));
            int k = rows.get(i) + j;
            int start = parsed.get(k);
            int end = parsed.get(k + 1);
            if (k + 1 < rows.get(i + 1)) {
                // Not the last field of the row: the byte before the next
                // field's start is the separating comma.
                end -= 1;
            } else {
                // Last field of the row: what follows is a one- or two-byte line
                // break, or nothing at all when the buffer ends without one.
                end = strip_line_break(start, end);
            }
            return new String(data, start, end - start, StandardCharsets.UTF_8);
        }

        /**
         * Returns {@code end} moved back over at most one line break (one byte,
         * or a CR/LF pair in either order), never going below {@code start}.
         */
        private int strip_line_break(int start, int end) {
            if (end > start && is_line_break(data[end - 1])) {
                byte last = data[end - 1];
                end -= 1;
                if (end > start && is_line_break(data[end - 1]) && data[end - 1] != last) {
                    end -= 1;
                }
            }
            return end;
        }

        /** Returns true if {@code c} is a carriage return or a line feed. */
        private static boolean is_line_break(byte c) {
            return c == '\r' || c == '\n';
        }
    }

    /**
     * Entry point.
     *
     * <p>{@code csv -single <file>} parses the file once and prints the third
     * field of the last row. {@code csv <file>} parses the file ten times and
     * prints the throughput of each run followed by the average.
     *
     * @throws IOException if the file cannot be read
     */
    public static void main(String[] args) throws IOException {
        boolean single = args.length > 0 && args[0].equals("-single");
        int needed = single ? 2 : 1;
        if (args.length < needed) {
            System.err.println("usage: java csv [-single] <file>");
            System.exit(1);
        }

        if (single) {
            // single test
            byte[] data = Files.readAllBytes(Paths.get(args[1]));
            var csv = new CSV();
            csv.parse(data);
            System.out.println(csv.entry(csv.row_count() - 1, 2));
        } else {
            final int runs = 10;
            byte[] data = Files.readAllBytes(Paths.get(args[0]));
            var csv = new CSV();
            // Sum the raw timings and divide once at the end, so integer
            // truncation is not applied on every run.
            long total = 0;
            for (int i = 0; i < runs; i++) {
                long start = System.nanoTime();
                csv.parse(data);
                long elapsed = System.nanoTime() - start;
                total += elapsed;
                double sec = elapsed / 1000000000.0;
                double bw = (double) data.length / sec;
                bw /= 1048576.0f;
                // The last argument has no matching format specifier: it is
                // evaluated on every run but not printed.
                System.out.printf("Run %d: %f MB/s (avg %f seconds)\n", 1+i, bw, sec, csv.entry(csv.row_count() - 1, 2));
            }
            double sec = (total / (double) runs) / 1000000000.0;
            double bw = ((double) data.length / sec);
            bw /= 1048576.0f;
            System.out.printf("Average nanos: %f MB/s (avg %f seconds)", bw, sec);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment