Last active
February 13, 2016 05:39
-
-
Save ccbrown/215722eb48d2cfced1e3 to your computer and use it in GitHub Desktop.
extract_palindromes.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <gsl.h> | |
#include <omp.h> | |
#include <thread> | |
#include <vector> | |
#include <sys/stat.h> | |
using namespace std::literals; | |
std::size_t file_size(gsl::not_null<gsl::czstring<>> name) { | |
struct stat stat_buf; | |
if (stat(name, &stat_buf)) { | |
throw std::runtime_error("unable to stat "s + name.get()); | |
} | |
return stat_buf.st_size; | |
} | |
void read_file(gsl::not_null<gsl::czstring<>> name, gsl::span<gsl::byte> destination, gsl::not_null<std::atomic<std::size_t>*> bytes_read) { | |
auto f = fopen(name, "r"); | |
if (!f) { | |
throw std::runtime_error("unable to open "s + name.get()); | |
} | |
auto _ = gsl::finally([f] { fclose(f); }); | |
constexpr std::size_t read_size = 16 * 1024; | |
std::size_t written = 0; | |
while (!feof(f)) { | |
auto max_read_size = std::min(read_size, destination.size() - written); | |
if (!max_read_size) { | |
char eof_test; | |
if (fread(&eof_test, 1, 1, f)) { | |
throw std::runtime_error("destination too small for "s + name.get()); | |
} | |
} else { | |
auto result = fread(destination.data() + written, 1, max_read_size, f); | |
written += result; | |
*bytes_read += result; | |
} | |
if (ferror(f)) { | |
throw std::runtime_error("error reading "s + name.get()); | |
} | |
} | |
} | |
bool is_palindrome(gsl::span<const gsl::byte> bytes) noexcept { | |
return std::equal(bytes.begin(), bytes.begin() + bytes.size() / 2, bytes.rbegin()); | |
} | |
std::vector<gsl::span<const gsl::byte>> palindromes(gsl::span<const gsl::byte> contents) noexcept { | |
auto ptr = contents.data(); | |
std::size_t rem = contents.size(); | |
std::size_t lines = 0; | |
std::vector<gsl::span<const gsl::byte>> ret; | |
while (rem) { | |
auto newline = reinterpret_cast<gsl::byte*>(memchr(ptr, '\n', rem)); | |
auto length = newline ? newline - ptr : rem; | |
++lines; | |
if (is_palindrome(gsl::as_span(ptr, length))) { | |
ret.emplace_back(ptr, length); | |
} | |
if (!newline) { break; } | |
rem -= length + 1; | |
ptr = ++newline; | |
} | |
return ret; | |
} | |
void extract_palindromes(gsl::not_null<gsl::czstring<>> input_path, gsl::not_null<gsl::czstring<>> output_path) { | |
auto output_file = fopen(output_path, "w"); | |
if (!output_file) { | |
throw std::runtime_error("unable to open "s + output_path.get() + " for writing"); | |
} | |
auto _1 = gsl::finally([output_file] { fclose(output_file); }); | |
gsl::byte* input_buffer = nullptr; | |
std::size_t input_size = 0; | |
std::tie(input_buffer, input_size) = std::get_temporary_buffer<gsl::byte>(file_size(input_path)); | |
auto _2 = gsl::finally([=] { std::return_temporary_buffer(input_buffer); }); | |
std::atomic<std::size_t> input_bytes_read{0}; | |
read_file(input_path, gsl::as_span(input_buffer, input_size), &input_bytes_read); | |
auto results = palindromes(gsl::as_span(input_buffer, input_bytes_read)); | |
for (auto& palindrome : results) { | |
if (fwrite(palindrome.data(), 1, palindrome.size(), output_file) < palindrome.size() || fwrite("\n", 1, 1, output_file) < 1) { | |
throw std::runtime_error("error writing to "s + output_path.get()); | |
} | |
} | |
} | |
void extract_palindromes_mp(gsl::not_null<gsl::czstring<>> input_path, gsl::not_null<gsl::czstring<>> output_path) { | |
auto output_file = fopen(output_path, "w"); | |
if (!output_file) { | |
throw std::runtime_error("unable to open "s + output_path.get() + " for writing"); | |
} | |
auto _1 = gsl::finally([output_file] { fclose(output_file); }); | |
gsl::byte* input_buffer = nullptr; | |
std::size_t input_size = 0; | |
std::tie(input_buffer, input_size) = std::get_temporary_buffer<gsl::byte>(file_size(input_path)); | |
auto _2 = gsl::finally([=] { std::return_temporary_buffer(input_buffer); }); | |
std::atomic<std::size_t> input_bytes_read{0}; | |
std::atomic<bool> input_read_finished{false}; | |
std::thread input_thread([&] { | |
read_file(input_path, gsl::as_span(input_buffer, input_size), &input_bytes_read); | |
input_read_finished = true; | |
}); | |
std::size_t input_bytes_consumed = 0; | |
#pragma omp parallel | |
{ | |
bool is_finished = false; | |
while (!is_finished) { | |
gsl::span<const gsl::byte> input; | |
constexpr std::size_t preferred_min_input_size = 4 * 1024; | |
#pragma omp critical | |
{ | |
auto input_read_finished_snapshot = input_read_finished.load(); | |
auto input_bytes_read_snapshot = input_bytes_read.load(); | |
auto input_bytes_available = input_bytes_read_snapshot - input_bytes_consumed; | |
if (input_bytes_available > preferred_min_input_size) { | |
auto preferred_input_size = std::max(input_bytes_available / omp_get_num_threads(), preferred_min_input_size); | |
if (auto end = reinterpret_cast<const gsl::byte*>(memchr(input_buffer + input_bytes_consumed + preferred_input_size, '\n', input_bytes_available - preferred_input_size))) { | |
input = gsl::as_span(input_buffer + input_bytes_consumed, end - (input_buffer + input_bytes_consumed) + 1); | |
input_bytes_consumed += input.size(); | |
} | |
} else if (input_read_finished_snapshot) { | |
if (input_bytes_available) { | |
input = gsl::as_span(input_buffer + input_bytes_consumed, input_bytes_available); | |
input_bytes_consumed = input_bytes_read_snapshot; | |
} | |
is_finished = true; | |
} | |
} | |
if (!input.empty()) { | |
auto results = palindromes(input); | |
for (auto& palindrome : results) { | |
flockfile(output_file); | |
auto _ = gsl::finally([=] { funlockfile(output_file); }); | |
if (fwrite(palindrome.data(), 1, palindrome.size(), output_file) < palindrome.size() || fwrite("\n", 1, 1, output_file) < 1) { | |
throw std::runtime_error("error writing to "s + output_path.get()); | |
} | |
} | |
} else { | |
std::this_thread::sleep_for(100ms); | |
} | |
} | |
} | |
input_thread.join(); | |
} | |
int main(int argc, gsl::czstring<> argv[]) { | |
if (argc < 3) { | |
throw std::runtime_error("usage: "s + argv[0] + " input_file output_file"s); | |
} | |
extract_palindromes_mp(argv[1], argv[2]); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment