Skip to content

Instantly share code, notes, and snippets.

@oschonrock
Last active January 5, 2021 19:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save oschonrock/67fc870ba067ebf0f369897a9d52c2dd to your computer and use it in GitHub Desktop.
Save oschonrock/67fc870ba067ebf0f369897a9d52c2dd to your computer and use it in GitHub Desktop.
High speed parsing of floats in CSV format - C++
// Refers to this Stack Overflow question:
// https://stackoverflow.com/questions/17465061/how-to-parse-space-separated-floats-in-c-quickly/59013147
#include <cstddef>
#include <cstdint>
#include <exception>
#include <iomanip>
#include <iostream>
#include <stdexcept>
#include <utility>
// for mmap:
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
/// Read-only, private memory mapping of a whole file.
///
/// The constructor maps the complete file; the mapping is released by the
/// destructor. The object is move-only: copying would make two objects unmap
/// the same region. An empty file is represented by an empty range
/// (`start() == end()`), because `mmap()` rejects a zero length.
class MemoryMappedFile {
public:
  /// Maps `filename` read-only.
  /// @throws std::logic_error if the file cannot be opened, stat'ed or mapped.
  MemoryMappedFile(const char* filename) {
    const int fd = open(filename, O_RDONLY);
    if (fd == -1) throw std::logic_error("MemoryMappedFile: couldn't open file.");

    // obtain file size
    struct stat sb;
    if (fstat(fd, &sb) == -1) {
      close(fd);  // do not leak the descriptor on the error path
      throw std::logic_error("MemoryMappedFile: cannot stat file size");
    }
    m_filesize = static_cast<size_t>(sb.st_size);

    // A zero-length mmap() fails with EINVAL, so only map non-empty files.
    void* addr = nullptr;
    if (m_filesize != 0) {
      addr = mmap(nullptr, m_filesize, PROT_READ, MAP_PRIVATE, fd, 0);
    }

    // The mapping stays valid after the descriptor is closed, so release it
    // right away instead of holding it for the lifetime of the object.
    close(fd);

    if (addr == MAP_FAILED) throw std::logic_error("MemoryMappedFile: cannot map file");
    m_map = static_cast<const char*>(addr);
  }

  MemoryMappedFile(const MemoryMappedFile&) = delete;
  MemoryMappedFile& operator=(const MemoryMappedFile&) = delete;

  /// Takes over the mapping of `other`, leaving `other` as an empty range.
  MemoryMappedFile(MemoryMappedFile&& other) noexcept
      : m_filesize(std::exchange(other.m_filesize, 0)),
        m_map(std::exchange(other.m_map, nullptr)) {}

  /// Releases the current mapping (if any) and takes over the one of `other`.
  MemoryMappedFile& operator=(MemoryMappedFile&& other) noexcept {
    if (this != &other) {
      unmap();
      m_filesize = std::exchange(other.m_filesize, 0);
      m_map = std::exchange(other.m_map, nullptr);
    }
    return *this;
  }

  ~MemoryMappedFile() { unmap(); }

  /// Pointer to the first mapped byte (nullptr for an empty or moved-from object).
  const char* start() const { return m_map; }
  /// Pointer one past the last mapped byte.
  const char* end() const { return m_map + m_filesize; }

private:
  /// Unmaps the region if one is held; a failure is reported but not thrown,
  /// because this runs from the destructor.
  void unmap() noexcept {
    if (m_map == nullptr) return;
    if (munmap(static_cast<void*>(const_cast<char*>(m_map)), m_filesize) == -1)
      std::cerr << "Warning: MemoryMappedFile: error in destructor during `munmap()`\n";
    m_map = nullptr;
    m_filesize = 0;
  }

  size_t m_filesize = 0;
  const char* m_map = nullptr;
};
// high speed str -> double parser
// Returns 10^magnitude using binary exponentiation (square-and-multiply).
// Overflows to +infinity once the result exceeds the range of double.
static double pow10_nonneg(unsigned int magnitude) {
  double result = 1.0;
  double base = 10.0;
  while (magnitude != 0) {
    if (magnitude & 1u) {
      result *= base;
    }
    base *= base;
    magnitude >>= 1;
  }
  return result;
}

// Returns 10^n as a double.
//
// Negative exponents are computed as 1 / 10^|n| rather than by repeatedly
// squaring 0.1: 0.1 has no exact binary representation, so squaring it
// compounds the error (0.1 * 0.1 != 0.01), whereas small powers of ten are
// exact and a single division is correctly rounded.
double pow10(int n) {
  if (n >= 0) {
    return pow10_nonneg(static_cast<unsigned int>(n));
  }

  // Negate in unsigned arithmetic: `-n` would overflow for INT_MIN.
  const unsigned int magnitude = 0u - static_cast<unsigned int>(n);

  // 10^308 is the largest power of ten that is still finite as a double.
  // Beyond that, divide in two steps so results in the subnormal range
  // (down to about 1e-323) do not collapse to 0 through 1 / infinity.
  constexpr unsigned int kLargestFiniteDecade = 308;
  if (magnitude <= kLargestFiniteDecade) {
    return 1.0 / pow10_nonneg(magnitude);
  }
  return (1.0 / pow10_nonneg(kLargestFiniteDecade)) /
         pow10_nonneg(magnitude - kLargestFiniteDecade);
}
// Fast, locale-independent conversion of the character range [start, end)
// to a double.
//
// Accepted syntax: [+|-] digits [. digits] [(e|E) [+|-] digits]
//
// Parsing stops at the first character that does not fit this syntax and the
// value read up to that point is returned; no error is reported. A null or
// empty range yields 0. The result is an approximation and is not guaranteed
// to be correctly rounded for every input.
double crack_atof(const char* start, const char* const end) {
  if (!start || !end || end <= start) {
    return 0;
  }

  const auto is_digit = [](char c) { return c >= '0' && c <= '9'; };

  // +/- sign
  int sign = 1;
  if (*start == '-') {
    sign = -1;
    ++start;
  } else if (*start == '+') {
    ++start;
  }

  // integer part
  double int_part = 0.0;
  while (start != end && is_digit(*start)) {
    int_part = int_part * 10 + (*start - '0');
    ++start;
  }

  // fractional part: accumulate the digits as a whole number and divide once.
  // Summing digit * 0.1^k instead would add a rounding error per digit,
  // because 0.1 is not exactly representable.
  double frac_part = 0.0;
  if (start != end && *start == '.') {
    ++start;
    double frac_digits = 0.0;
    double frac_scale = 1.0;
    while (start != end && is_digit(*start)) {
      // Digits beyond this point cannot influence the result; skipping them
      // keeps frac_scale finite for absurdly long inputs.
      if (frac_scale < 1e300) {
        frac_digits = frac_digits * 10 + (*start - '0');
        frac_scale *= 10.0;
      }
      ++start;
    }
    frac_part = frac_digits / frac_scale;
  }

  const double magnitude = int_part + frac_part;

  // No exponent marker (either case) follows: the number ends here.
  if (start == end || (*start != 'e' && *start != 'E')) {
    return sign * magnitude;
  }
  ++start;

  // parsing exponent part
  int exp_sign = 1;
  if (start != end) {
    if (*start == '-') {
      exp_sign = -1;
      ++start;
    } else if (*start == '+') {
      ++start;
    }
  }

  // Saturate instead of overflowing `int`: any exponent this large already
  // drives the result to infinity or zero.
  constexpr int kExponentCap = 100000;
  int e = 0;
  while (start != end && is_digit(*start)) {
    if (e < kExponentCap) {
      e = e * 10 + (*start - '0');
    }
    ++start;
  }

  // Zero stays zero whatever the exponent; this avoids 0 * infinity = NaN.
  if (magnitude == 0.0) {
    return sign * magnitude;
  }

  // For negative exponents whose power of ten is still finite, dividing by
  // 10^e is more accurate than multiplying by an inexact 10^-e.
  constexpr int kLargestFiniteDecade = 308;
  if (exp_sign < 0 && e <= kLargestFiniteDecade) {
    return sign * magnitude / pow10(e);
  }
  return sign * magnitude * pow10(exp_sign * e);
}
int main() {
MemoryMappedFile map = MemoryMappedFile("FloatDataset.csv");
const char* curr = map.start();
const char* start = map.start();
const char* const end = map.end();
uintmax_t lines_n = 0;
int cnt = 0;
double sum = 0.0;
while (curr && curr != end) {
if (*curr == ',' || *curr == '\n') {
// std::string fieldstr(start, curr);
// double field = std::stod(fieldstr);
// m_numLines = 11000000 cnt=33000000 sum=16498294753551.9
// real 5.998s
double field = crack_atof(start, curr);
// m_numLines = 11000000 cnt=33000000 sum=16498294753551.9
// real 1.327s
sum += field;
++cnt;
if (*curr == '\n') lines_n++;
curr++;
start = curr;
} else {
++curr;
}
}
std::cout << std::setprecision(15) << "m_numLines = " << lines_n << " cnt=" << cnt
<< " sum=" << sum << "\n";
}
#include <iostream>
#include <random>
#include <iomanip>
// Dataset generator: writes rows_n lines of cols_n comma-separated random
// doubles to `filename`, the file name the benchmark above reads.
// NOTE(review): these statements appear without an enclosing function, and
// std::ofstream is used although <fstream> is not among the includes shown
// above -- presumably the surrounding `main` and include were lost; confirm
// against the original file.
std::string filename = "FloatDataset.csv";
const int cols_n = 3;
const int rows_n = 11'000'000;
std::ofstream ofstream(filename);
// Give up immediately if the output file could not be created/opened.
if (!ofstream.is_open()) {
std::cerr << "failed to open " << filename << '\n';
exit(1);
}
// NOTE(review): `rd` is only referenced by the commented-out seed below.
std::random_device rd;
// Fixed seed (1) makes the generated dataset reproducible between runs.
std::mt19937 gen{1}; // rd()};
// Values are drawn uniformly between 1 and 1e6.
std::uniform_real_distribution<double> dist{1, 1e6};
// Write 15 significant digits per value.
ofstream << std::setprecision(15);
for (int r = 0; r < rows_n; r++) {
for (int c = 0; c < cols_n; c++) {
double num = dist(gen);
ofstream << num;
// Comma between columns, but not after the last column of a row.
if (c != cols_n - 1) ofstream << ',';
}
// Every row, including the last one, is terminated by a newline.
ofstream << "\n";
}
ofstream.close();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment