Skip to content

Instantly share code, notes, and snippets.

@lemire
Created May 5, 2023 16:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lemire/4e3b5322878fb9da74b3734973ec955d to your computer and use it in GitHub Desktop.
Save lemire/4e3b5322878fb9da74b3734973ec955d to your computer and use it in GitHub Desktop.
fast piping to ada...
#include <unistd.h>

#include <cerrno>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <string_view>
#include <vector>

#include "ada.h"
/// Splits a caller-owned character range into '\n'-terminated lines.
///
/// The iterator only holds a view; the bytes passed to the constructor must
/// stay alive and unchanged while the iterator is in use. Typical usage:
///   while (it.find_another_complete_line()) { use(it.grab_line()); }
///   // it.tail() bytes at the end were not terminated by '\n'.
struct line_iterator {
  /// Text that has not been handed out by grab_line() yet.
  std::string_view all_text;
  /// Position of the '\n' located by the last search (npos when none was
  /// found). Starts at 0, i.e. before any search has been performed.
  size_t next_end_of_line{0};

  line_iterator(const char *_buffer, size_t _len) : all_text(_buffer, _len) {}

  /// Searches the remaining text for the next '\n' and records its position.
  /// @return true when a newline-terminated line is available.
  bool find_another_complete_line() noexcept {
    const size_t newline_at = all_text.find('\n');
    next_end_of_line = newline_at;
    return newline_at != std::string_view::npos;
  }

  /// Reflects the recorded search result: false only once a search has
  /// failed to find a '\n'.
  operator bool() const noexcept {
    return !(next_end_of_line == std::string_view::npos);
  }

  /// Returns the text in front of the recorded newline position and drops
  /// that text plus the newline itself from the remaining view.
  std::string_view grab_line() noexcept {
    const size_t line_length = next_end_of_line;
    const std::string_view current = all_text.substr(0, line_length);
    // Skip the line and its terminating '\n'.
    all_text.remove_prefix(line_length + 1);
    return current;
  }

  /// Number of bytes still held by the remaining view.
  size_t tail() const noexcept { return all_text.size(); }
};
/// Reads std::chrono::steady_clock and returns the time elapsed since that
/// clock's epoch, expressed as a count of nanoseconds.
uint64_t nano() {
  using clock = std::chrono::steady_clock;
  const clock::duration since_epoch = clock::now().time_since_epoch();
  const auto as_nanoseconds =
      std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch);
  return as_nanoseconds.count();
}
/// Parses one candidate URL with ada and, when parsing fails, writes
/// "Invalid URL: <line>" to standard output.
static void report_if_invalid(std::string_view line) {
  ada::result<ada::url_aggregator> url = ada::parse(line);
  if (!url) {
    // Use std::cout (not printf): after sync_with_stdio(false) the C and C++
    // streams buffer independently, so mixing them can reorder the output.
    std::cout << "Invalid URL: " << line << '\n';
  }
}

/// Reads newline-separated URLs from standard input in large blocks, parses
/// each one with ada, reports the invalid ones, and finally prints the byte
/// count, elapsed time, line count, block count and throughput in GB/s.
///
/// @return EXIT_SUCCESS once standard input is exhausted, EXIT_FAILURE when
///         read() reports an error other than EINTR.
int main() {
  std::ios_base::sync_with_stdio(false);
  constexpr size_t initial_cache_length = 32768;
  std::vector<char> cachebuffer(initial_cache_length);
  const uint64_t before = nano();
  size_t howmany = 0;       // total bytes received from stdin
  size_t offset = 0;        // bytes of an unfinished line kept at buffer start
  size_t lines = 0;         // number of lines handed to the parser
  size_t sum_of_lines = 0;  // bytes accounted to lines (including '\n')
  size_t blocks = 0;        // number of successful non-empty reads
  for (;;) {
    if (offset == cachebuffer.size()) {
      // The unfinished line occupies the whole buffer. Asking read() for zero
      // bytes would return 0 and be mistaken for end of input, so grow the
      // buffer instead of silently dropping the rest of the stream.
      cachebuffer.resize(cachebuffer.size() * 2);
    }
    // read() returns a signed count: -1 signals an error and must not be
    // folded into an unsigned byte count.
    const ssize_t tr = read(STDIN_FILENO, cachebuffer.data() + offset,
                            cachebuffer.size() - offset);
    if (tr < 0) {
      if (errno == EINTR) {
        continue;  // interrupted before any data arrived: retry
      }
      std::cerr << "read failed: " << std::strerror(errno) << '\n';
      return EXIT_FAILURE;
    }
    if (tr == 0) {
      break;  // end of input
    }
    const size_t received = static_cast<size_t>(tr);
    howmany += received;
    blocks++;
    const size_t capacity = received + offset;  // valid bytes in the buffer
    line_iterator li(cachebuffer.data(), capacity);
    while (li.find_another_complete_line()) {
      const std::string_view line = li.grab_line();
      report_if_invalid(line);
      sum_of_lines += line.size() + 1;  // +1 for the consumed '\n'
      lines++;
    }
    // Keep the unfinished trailing line by moving it to the buffer start so
    // the next read appends to it.
    offset = li.tail();
    if (offset > 0) {
      std::memmove(cachebuffer.data(), cachebuffer.data() + capacity - offset,
                   offset);
    }
  }
  if (offset > 0) {
    // The input did not end with '\n': the last line of length offset sits at
    // the start of the buffer.
    report_if_invalid(std::string_view(cachebuffer.data(), offset));
    lines++;
    sum_of_lines += offset;
  }
  const uint64_t after = nano();
  const double giga = howmany / 1000000000.;
  std::cout << "read " << howmany << " bytes in " << (after - before)
            << " ns using " << lines << " lines, sum_of_lines is "
            << sum_of_lines << " used " << blocks << " loads" << std::endl;
  const double seconds = (after - before) / 1000000000.;
  const double speed = giga / seconds;
  std::cout << speed << " GB/s" << std::endl;
  return EXIT_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment