Created
May 5, 2023 16:09
-
-
Save lemire/4e3b5322878fb9da74b3734973ec955d to your computer and use it in GitHub Desktop.
fast piping to ada...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <unistd.h>

#include <cerrno>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <string_view>

#include "ada.h"
/// Walks a character buffer one '\n'-terminated line at a time.
///
/// The iterator only views the buffer (via std::string_view); it never copies
/// or owns it. Typical use: call find_another_complete_line(), and while it
/// returns true, call grab_line() to take the line and advance.
struct line_iterator {
  /// The part of the buffer that has not been handed out yet.
  std::string_view all_text;
  /// Index of the next '\n' inside all_text, or npos when the last search
  /// found none. Starts at 0 before any search has been made.
  size_t next_end_of_line{0};

  /// Views the `_len` bytes starting at `_buffer`.
  line_iterator(const char *_buffer, size_t _len) : all_text(_buffer, _len) {}

  /// Searches the remaining text for a newline and records its position.
  /// @return true when a newline (hence a complete line) is available.
  bool find_another_complete_line() noexcept {
    const size_t newline_at = all_text.find('\n');
    next_end_of_line = newline_at;
    return newline_at != std::string_view::npos;
  }

  /// True unless the most recent search failed to find a newline.
  operator bool() const noexcept {
    return !(next_end_of_line == std::string_view::npos);
  }

  /// Returns the text before the recorded newline and drops that text plus
  /// the newline itself from the remaining view.
  std::string_view grab_line() noexcept {
    const size_t line_length = next_end_of_line;
    const std::string_view current = all_text.substr(0, line_length);
    all_text.remove_prefix(line_length + 1);
    return current;
  }

  /// Number of bytes that have not been consumed yet.
  size_t tail() const noexcept { return all_text.size(); }
};
/// Returns the current std::chrono::steady_clock reading, expressed as the
/// number of nanoseconds since that clock's epoch. Intended for measuring
/// elapsed time by subtracting two readings.
uint64_t nano() {
  using clock = std::chrono::steady_clock;
  const clock::duration since_epoch = clock::now().time_since_epoch();
  const auto as_nanoseconds =
      std::chrono::duration_cast<std::chrono::nanoseconds>(since_epoch);
  return as_nanoseconds.count();
}
int main() { | |
std::ios_base::sync_with_stdio(false); | |
constexpr size_t cache_length = 32768; | |
std::unique_ptr<char[]> cachebuffer(new char[cache_length]{}); | |
uint64_t before = nano(); | |
size_t howmany = 0; | |
size_t tr; | |
size_t offset = 0; | |
size_t lines = 0; | |
size_t sum_of_lines = 0; | |
size_t blocks = 0; | |
while ((tr = read(0, cachebuffer.get() + offset, cache_length - offset))) { | |
howmany += tr; | |
blocks++; | |
size_t capacity = tr + offset; | |
line_iterator li(cachebuffer.get(), capacity); | |
while (li.find_another_complete_line()) { | |
std::string_view line = li.grab_line(); | |
ada::result<ada::url_aggregator> url = ada::parse(line); | |
if (!url) { | |
printf("Invalid URL: %.*s\n", int(line.size()), line.data()); | |
} | |
sum_of_lines += line.size() + 1; | |
lines++; | |
} | |
if ((offset = li.tail()) > 0) { | |
memmove(cachebuffer.get(), cachebuffer.get() + capacity - offset, offset); | |
} | |
} | |
if (offset > 0) { | |
// have a line of length offset at cachebuffer.get() | |
ada::result<ada::url_aggregator> url = | |
ada::parse(std::string_view(cachebuffer.get(), offset)); | |
if (!url) { | |
printf("Invalid URL: %.*s\n", int(offset), cachebuffer.get()); | |
} | |
lines++; | |
sum_of_lines += offset; | |
} | |
uint64_t after = nano(); | |
double giga = howmany / 1000000000.; | |
std::cout << "read " << howmany << " bytes in " << (after - before) | |
<< " ns using " << lines << " lines, sum_of_lines is " | |
<< sum_of_lines << " used " << blocks << " loads" << std::endl; | |
double seconds = (after - before) / 1000000000.; | |
double speed = giga / seconds; | |
std::cout << speed << " GB/s" << std::endl; | |
return EXIT_SUCCESS; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment