Skip to content

Instantly share code, notes, and snippets.

@saki7
Last active February 25, 2022 11:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save saki7/d9126a5d5c2a6d71c867fec5fb890b09 to your computer and use it in GitHub Desktop.
Save saki7/d9126a5d5c2a6d71c867fec5fb890b09 to your computer and use it in GitHub Desktop.
Public Suffix List (https://publicsuffix.org/) をパースするC++のコードの一例
#include "syn/bridge.hpp"
#include "syn/public_suffix.hpp"
#include <saya/logger.hpp>
#include <boost/range/algorithm/unique.hpp>
#include <boost/range/algorithm/sort.hpp>
#include <boost/range/algorithm_ext/erase.hpp>
#include <boost/algorithm/cxx11/any_of.hpp>
#include <boost/algorithm/string/join.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <boost/assert.hpp>
#include <regex>
#include <fstream>
#include <vector>
#include <algorithm>
namespace syn {
// Matches one run of non-dot characters followed by a dot (a label plus its
// trailing dot). Used to step through a name label by label.
std::regex const PublicSuffix::SUBDOMAIN_RGX(R"([^.]+\.)");
// Flag consulted by init() (to skip repeated loading) and by to_toplevels()
// (to reject calls made before init()).
bool PublicSuffix::is_inited_ = false;
// Bodies of wildcard rules with the leading "*." removed, stored unescaped.
std::vector<std::string> PublicSuffix::wildcard_buf_;
// One regex per "!" exception rule: the dot-escaped rule body followed by "$",
// with no label prefix.
std::vector<std::regex> PublicSuffix::whitelist_;
// Wildcard rules prefixed with "^(?:[^.]+\.)?": the rule body itself or the
// body with exactly one extra leading label.
std::vector<PublicSuffix::Regex> PublicSuffix::exact_denied_regexs_;
// Wildcard rules prefixed with "(?:[^.]+\.)*": the rule body with any number
// of leading labels.
std::vector<PublicSuffix::Regex> PublicSuffix::denied_regexs_;
// "!" exception rules prefixed with "(?:[^.]+\.)*".
std::vector<PublicSuffix::Regex> PublicSuffix::overridden_regexs_;
// Plain rules prefixed with "(?:[^.]+\.)+": at least one label in front of
// the suffix is required.
std::vector<PublicSuffix::Regex> PublicSuffix::valid_regexs_;
void PublicSuffix::init()
{
if (is_inited_) return;
is_inited_ = true;
saya::managed_logger l("PublicSuffix");
static const path_t PS_FILE = bridge().data_dir()/"publicsuffix"/"public_suffix_list.dat";
if (!boost::filesystem::exists(PS_FILE)) {
throw public_suffix_error("Required file \"" + PS_FILE.string() + "\" does not exist");
}
l.info() << "Loading file " << PS_FILE << "..." << std::endl;
std::ifstream ifs(PS_FILE.string(), std::ios::in);
std::regex const prefix_rgx(R"(^(//|\*\.|!))");
std::smatch what;
std::regex_constants::match_flag_type const flags = std::regex_constants::match_default;
// Regexs for escaping dots
std::regex const dot_rgx(R"(\.)");
// Buffer for sorting regexs
struct RegexBuf
{
std::string rgx_str;
std::size_t origin_match_count;
};
std::vector<RegexBuf> exact_denied_regexs_buf, denied_regexs_buf, overridden_regexs_buf, valid_regexs_buf;
while (!ifs.eof()) {
std::string line;
std::getline(ifs, line);
boost::algorithm::trim(line);
if (line.empty()) continue;
std::string::const_iterator
start = line.cbegin(),
end = line.cend(),
last_match_end
;
// l.info() << line << std::endl;
MatchType match_type = MATCH_NONE;
std::size_t match_length;
while (std::regex_search(start, end, what, prefix_rgx, flags)) {
if (what[0].matched) {
// l.warn() << "Matched " << what[0] << std::endl;
auto const& m = what[0].str();
match_length = m.size();
if (m == "//") {
match_type = MATCH_COMMENT;
} else if (m == "*.") {
match_type = MATCH_WILDCARD;
} else if (m == "!") {
match_type = MATCH_EXCLAMATION;
}
}
start = end;
}
switch (match_type) {
case MATCH_COMMENT:
continue;
case MATCH_WILDCARD:
case MATCH_EXCLAMATION:
line = std::string(line.begin() + match_length, line.end());
break;
}
if (match_type == MATCH_WILDCARD) {
wildcard_buf_.push_back(line);
}
// Escape a dot in a suffix
line = std::regex_replace(line, dot_rgx, R"(\.)");
// Add a trailing $ (\z is not supported)
line += R"($)";
switch (match_type) {
case MATCH_NONE:
valid_regexs_buf.push_back({R"((?:[^.]+\.)+)" + line, 0});
break;
case MATCH_WILDCARD: {
std::string::const_iterator
start = line.cbegin(),
end = line.cend()
;
std::smatch what;
std::size_t origin_match_count = 0;
while (std::regex_search(start, end, what, SUBDOMAIN_RGX)) {
++origin_match_count;
start = what[0].second;
}
exact_denied_regexs_buf.push_back({R"(^(?:[^.]+\.)?)" + line, origin_match_count});
denied_regexs_buf.push_back({R"((?:[^.]+\.)*)" + line, origin_match_count});
break;
}
case MATCH_EXCLAMATION:
whitelist_.push_back(std::regex(line));
overridden_regexs_buf.push_back({R"((?:[^.]+\.)*)" + line, 0});
break;
}
// l.warn() << "Line: " << line << std::endl;
}
l.info() << "Sorting by length..." << std::endl;
boost::sort(exact_denied_regexs_buf, [] (RegexBuf const& lhs, RegexBuf const& rhs) {
return lhs.rgx_str.size() > rhs.rgx_str.size();
});
boost::sort(denied_regexs_buf, [] (RegexBuf const& lhs, RegexBuf const& rhs) {
return lhs.rgx_str.size() > rhs.rgx_str.size();
});
boost::sort(overridden_regexs_buf, [] (RegexBuf const& lhs, RegexBuf const& rhs) {
return lhs.rgx_str.size() > rhs.rgx_str.size();
});
boost::sort(valid_regexs_buf, [] (RegexBuf const& lhs, RegexBuf const& rhs) {
return lhs.rgx_str.size() > rhs.rgx_str.size();
});
l.info() << "Sorted." << std::endl;
for (auto const& r : exact_denied_regexs_buf) {
exact_denied_regexs_.push_back({std::regex(r.rgx_str), r.origin_match_count});
}
for (auto const& r : denied_regexs_buf) {
denied_regexs_.push_back({std::regex(r.rgx_str), r.origin_match_count});
}
for (auto const& r : overridden_regexs_buf) {
overridden_regexs_.push_back({std::regex(r.rgx_str), r.origin_match_count});
}
for (auto const& r : valid_regexs_buf) {
valid_regexs_.push_back({std::regex(r.rgx_str), r.origin_match_count});
}
exact_denied_regexs_buf.clear();
denied_regexs_buf.clear();
overridden_regexs_buf.clear();
valid_regexs_buf.clear();
l.info() << "Loaded successfully!" << std::endl;
// {
// std::cmatch what;
// l.warn() << std::boolalpha
// << std::regex_match(
// "foo.githubcloudusercontent.com",
// what,
// std::regex(R"([^.]+\.githubcloudusercontent\.com$)")
// )
// << std::endl;
//
// l.warn() << what[0] << std::endl;
// }
}
std::vector<std::string>
PublicSuffix::to_toplevels(std::string const& domain)
{
if (!is_inited_) {
throw public_suffix_not_initialized_error("to_toplevels called before init()");
}
if (domain == "localhost") throw unknown_domain_error(domain);
std::vector<std::string> toplevels;
bool is_denied = false;
for (auto const& rgx : denied_regexs_) {
if (std::regex_match(domain, rgx.rgx)) {
std::string::const_iterator
start = domain.cbegin(),
end = domain.cend()
;
std::smatch what;
std::size_t count = 0;
std::vector<std::string> toplevels_buf;
while (std::regex_search(start, end, what, SUBDOMAIN_RGX)) {
toplevels_buf.push_back(std::string(start, end));
++count;
start = what[0].second;
}
if (count == rgx.origin_match_count) {
throw denied_domain_error(domain);
}
if (count > rgx.origin_match_count + 1) {
toplevels = std::vector<std::string>(toplevels_buf.begin(), toplevels_buf.begin() + (count - rgx.origin_match_count - 1));
is_denied = false;
} else {
is_denied = true;
}
break;
}
}
bool is_overridden = false;
if (is_denied) {
for (auto const& orgx : overridden_regexs_) {
if (std::regex_match(domain, orgx.rgx)) {
is_denied = false;
std::string::const_iterator
start = domain.cbegin(),
end = domain.cend()
;
std::smatch what;
while (std::regex_search(start, end, what, SUBDOMAIN_RGX)) {
toplevels.push_back(std::string(start, end));
start = what[0].second;
}
is_overridden = true;
BOOST_ASSERT(!toplevels.empty());
break;
}
}
// remove the denied parent domain (again)
boost::remove_erase_if(toplevels, [] (std::string const& toplevel) {
for (auto const& rgx : exact_denied_regexs_) {
for (auto const& wrgx : whitelist_) {
if (std::regex_match(toplevel, wrgx)) return false;
}
if (std::regex_match(toplevel, rgx.rgx)) {
return true;
}
}
return false;
});
} else {
// Look for remaining subdomains which is NOT denied
std::string::const_iterator
start = domain.cbegin(),
end = domain.cend()
;
std::smatch what;
while (std::regex_search(start, end, what, SUBDOMAIN_RGX)) {
auto const subdomain = std::string(start, end);
// std::regex const wrgx(
// R"((?:[^.]+\.)*)" +
// std::regex_replace(subdomain, std::regex(R"(\.)"), R"(\.)")
// );
for (auto const& orgx : overridden_regexs_) {
if (
!boost::algorithm::any_of_equal(wildcard_buf_, subdomain) &&
std::regex_match(subdomain, orgx.rgx)
) {
for (auto const& vrgx : valid_regexs_) {
if (std::regex_match(subdomain, vrgx.rgx)) {
toplevels.push_back(subdomain);
}
}
}
}
start = what[0].second;
}
}
if (is_denied) {
throw denied_domain_error(domain);
} else {
if (!toplevels.empty() || is_overridden) {
boost::erase(toplevels, boost::unique<boost::return_found_end>(boost::sort(toplevels, [] (std::string const& lhs, std::string const& rhs) {
return lhs.size() > rhs.size();
})));
return toplevels;
}
}
for (auto const& rgx : valid_regexs_) {
std::smatch what;
if (std::regex_match(domain, what, rgx.rgx)) {
std::string::const_iterator
start = domain.cbegin(),
end = domain.cend()
;
std::smatch what2;
while (std::regex_search(start, end, what2, SUBDOMAIN_RGX)) {
toplevels.push_back(std::string(start, end));
start = what2[0].second;
}
boost::erase(toplevels, boost::unique<boost::return_found_end>(boost::sort(toplevels, [] (std::string const& lhs, std::string const& rhs) {
return lhs.size() > rhs.size();
})));
return toplevels;
}
}
throw unknown_domain_error(domain);
}
} // syn
#ifndef SYN_PUBLICSUFFIX_HPP_
#define SYN_PUBLICSUFFIX_HPP_
#include "syn/error.hpp"
#include <vector>
#include <regex>
namespace syn {
// Base type for every error reported by the public-suffix code in this header.
// Reuses syn_error's constructors unchanged.
struct public_suffix_error : syn_error
{
using syn_error::syn_error;
};
// Raised when the public-suffix tables are used before they have been loaded.
// Reuses public_suffix_error's constructors unchanged.
struct public_suffix_not_initialized_error : public_suffix_error
{
using public_suffix_error::public_suffix_error;
};
/// Error for a name that is rejected as a denied domain.
struct denied_domain_error : public_suffix_error
{
    /// @param domain  the rejected name; the message becomes
    ///                "Denied domain: <domain>".
    /// `explicit` so a plain string never converts silently into an exception.
    explicit denied_domain_error(std::string const& domain)
        : public_suffix_error("Denied domain: " + domain)
    {}
};
/// Error for a name that cannot be classified as a known domain.
struct unknown_domain_error : public_suffix_error
{
    /// @param domain  the unrecognised name; the message becomes
    ///                "Unknown domain: <domain>".
    /// `explicit` so a plain string never converts silently into an exception.
    explicit unknown_domain_error(std::string const& domain)
        : public_suffix_error("Unknown domain: " + domain)
    {}
};
/// Static-only access point for Public Suffix List lookups.
///
/// All state lives in static members and the constructor is private, so the
/// class is used purely through its static functions.
class PublicSuffix
{
public:
    /// Loads the rule tables; see the definition for the file location and
    /// the exceptions it can raise.
    static void init();

    /// Reports the current value of the initialization flag.
    static bool is_inited() { return is_inited_; }

    /// Returns the accepted tails of `domain`; throws a public_suffix_error
    /// subtype when the name is denied, unknown, or init() was not called.
    static std::vector<std::string> to_toplevels(std::string const& domain);

private:
    /// Classification of one line of the list by its leading marker.
    enum MatchType
    {
        MATCH_NONE,
        MATCH_COMMENT,
        MATCH_WILDCARD,
        MATCH_EXCLAMATION,
    };

    /// A compiled rule together with the "label." count recorded for it
    /// when it was built.
    struct Regex
    {
        std::regex rgx;
        std::size_t origin_match_count;
    };

    PublicSuffix() = default;

    /// Declared only; no definition is visible alongside this class.
    static void test();

    /// Matches a single label followed by its dot.
    static std::regex const SUBDOMAIN_RGX;

    static bool is_inited_;
    static std::vector<std::string> wildcard_buf_;
    static std::vector<std::regex> whitelist_;
    static std::vector<Regex> exact_denied_regexs_;
    static std::vector<Regex> denied_regexs_;
    static std::vector<Regex> overridden_regexs_;
    static std::vector<Regex> valid_regexs_;
};
} // syn
#endif
#include "syn/public_suffix.hpp"
#include <saya/logger.hpp>
#include "gtest/gtest.h"
// End-to-end check of PublicSuffix::to_toplevels(): first that it refuses to
// run before init(), then - after loading the real list - which names are
// denied, which are accepted (and with which tails, longest first), and which
// are unknown.
// NOTE(review): the first expectation only holds if nothing in the process has
// called PublicSuffix::init() earlier - confirm test ordering/isolation.
// NOTE(review): expected results depend on the contents of the list file that
// init() loads - verify against the data file in use.
TEST(PublicSuffix, to_toplevels)
{
saya::managed_logger l("PublicSuffixTest");
l.info() << "Testing..." << std::endl;
// Must be rejected while the tables are still unloaded.
EXPECT_THROW({
syn::PublicSuffix::to_toplevels("");
}, syn::public_suffix_not_initialized_error);
syn::PublicSuffix::init();
// denied: the bare name itself (presumably the base of a wildcard rule)
EXPECT_THROW({
std::string t("githubcloudusercontent.com");
auto const t_t = syn::PublicSuffix::to_toplevels(t);
}, syn::denied_domain_error);
// denied: only one label in front of that name
EXPECT_THROW({
std::string t0("foo.githubcloudusercontent.com");
auto const t0_t = syn::PublicSuffix::to_toplevels(t0);
}, syn::denied_domain_error);
// valid: two labels in front; exactly one tail is expected
EXPECT_NO_THROW({
std::string t1("bar.foo.githubcloudusercontent.com");
auto const t1_t = syn::PublicSuffix::to_toplevels(t1);
for (auto const& t : t1_t) {
l.info() << "t1: " << t << std::endl;
}
// ASSERT (not EXPECT) so the indexing below never runs on a short vector.
ASSERT_EQ(t1_t.size(), std::size_t(1));
EXPECT_EQ(t1_t[0], "bar.foo.githubcloudusercontent.com");
});
// valid too: three labels in front; two tails, longest first
EXPECT_NO_THROW({
std::string t2("baz.bar.foo.githubcloudusercontent.com");
auto const t2_t = syn::PublicSuffix::to_toplevels(t2);
for (auto const& t : t2_t) {
l.info() << "t2: " << t << std::endl;
}
ASSERT_EQ(t2_t.size(), std::size_t(2));
EXPECT_EQ(t2_t[0], "baz.bar.foo.githubcloudusercontent.com");
EXPECT_EQ(t2_t[1], "bar.foo.githubcloudusercontent.com");
});
// denied: the bare name itself
EXPECT_THROW({
std::string t3_0("kawasaki.jp");
auto const t3_0_t = syn::PublicSuffix::to_toplevels(t3_0);
}, syn::denied_domain_error);
// valid: accepted although it has only one label in front of kawasaki.jp
// (presumably via an exception rule in the list)
EXPECT_NO_THROW({
std::string t3("city.kawasaki.jp");
auto const t3_t = syn::PublicSuffix::to_toplevels(t3);
for (auto const& t : t3_t) {
l.info() << "t3: " << t << std::endl;
}
ASSERT_EQ(t3_t.size(), std::size_t(1));
EXPECT_EQ(t3_t[0], "city.kawasaki.jp");
});
// valid too: one more label on top of the previous case
EXPECT_NO_THROW({
std::string t4("www.city.kawasaki.jp");
auto const t4_t = syn::PublicSuffix::to_toplevels(t4);
for (auto const& t : t4_t) {
l.info() << "t4: " << t << std::endl;
}
ASSERT_EQ(t4_t.size(), std::size_t(2));
EXPECT_EQ(t4_t[0], "www.city.kawasaki.jp");
EXPECT_EQ(t4_t[1], "city.kawasaki.jp");
});
// valid too: deeper nesting yields one tail per level down to city.kawasaki.jp
EXPECT_NO_THROW({
std::string t4_2("www3.www2.www.city.kawasaki.jp");
auto const t4_2_t = syn::PublicSuffix::to_toplevels(t4_2);
for (auto const& t : t4_2_t) {
l.info() << "t4_2: " << t << std::endl;
}
ASSERT_EQ(t4_2_t.size(), std::size_t(4));
EXPECT_EQ(t4_2_t[0], "www3.www2.www.city.kawasaki.jp");
EXPECT_EQ(t4_2_t[1], "www2.www.city.kawasaki.jp");
EXPECT_EQ(t4_2_t[2], "www.city.kawasaki.jp");
EXPECT_EQ(t4_2_t[3], "city.kawasaki.jp");
});
// valid too: name under a plain suffix; one tail per level down to saki7.jp
EXPECT_NO_THROW({
std::string t5("www3.www2.www.saki7.jp");
auto const t5_t = syn::PublicSuffix::to_toplevels(t5);
for (auto const& t : t5_t) {
l.info() << "t5: " << t << std::endl;
}
ASSERT_EQ(t5_t.size(), std::size_t(4));
EXPECT_EQ(t5_t[0], "www3.www2.www.saki7.jp");
EXPECT_EQ(t5_t[1], "www2.www.saki7.jp");
EXPECT_EQ(t5_t[2], "www.saki7.jp");
EXPECT_EQ(t5_t[3], "saki7.jp");
});
// valid too: a single label in front of a plain suffix
EXPECT_NO_THROW({
std::string t5_2("saki7.jp");
auto const t5_2_t = syn::PublicSuffix::to_toplevels(t5_2);
for (auto const& t : t5_2_t) {
l.info() << "t5_2: " << t << std::endl;
}
ASSERT_EQ(t5_2_t.size(), std::size_t(1));
EXPECT_EQ(t5_2_t[0], "saki7.jp");
});
// unknown: ends in something that is not expected to be a listed suffix
EXPECT_THROW({
std::string t6("foo.foobarbaz123321123321555551234q#?!");
auto const t6_t = syn::PublicSuffix::to_toplevels(t6);
}, syn::unknown_domain_error);
// unknown: a dotless name other than the special-cased "localhost"
EXPECT_THROW({
std::string t7("localhost2");
auto const t7_t = syn::PublicSuffix::to_toplevels(t7);
}, syn::unknown_domain_error);
// unknown: same name with a trailing dot
EXPECT_THROW({
std::string t8("localhost2.");
auto const t8_t = syn::PublicSuffix::to_toplevels(t8);
}, syn::unknown_domain_error);
// unknown: a bare suffix with no label in front of it
EXPECT_THROW({
std::string t9("jp");
auto const t9_t = syn::PublicSuffix::to_toplevels(t9);
}, syn::unknown_domain_error);
// unknown: a leading dot does not count as a label
EXPECT_THROW({
std::string t10(".jp");
auto const t10_t = syn::PublicSuffix::to_toplevels(t10);
}, syn::unknown_domain_error);
// unknown: a trailing dot after the suffix
EXPECT_THROW({
std::string t11("jp.");
auto const t11_t = syn::PublicSuffix::to_toplevels(t11);
}, syn::unknown_domain_error);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment