Last active
February 25, 2022 11:43
-
-
Save saki7/d9126a5d5c2a6d71c867fec5fb890b09 to your computer and use it in GitHub Desktop.
Public Suffix List (https://publicsuffix.org/) をパースするC++のコードの一例
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "syn/bridge.hpp" | |
#include "syn/public_suffix.hpp" | |
#include <saya/logger.hpp> | |
#include <boost/range/algorithm/unique.hpp> | |
#include <boost/range/algorithm/sort.hpp> | |
#include <boost/range/algorithm_ext/erase.hpp> | |
#include <boost/algorithm/cxx11/any_of.hpp> | |
#include <boost/algorithm/string/join.hpp> | |
#include <boost/algorithm/string/trim.hpp> | |
#include <boost/assert.hpp> | |
#include <regex> | |
#include <fstream> | |
#include <vector> | |
#include <algorithm> | |
namespace syn { | |
std::regex const PublicSuffix::SUBDOMAIN_RGX(R"([^.]+\.)"); | |
bool PublicSuffix::is_inited_ = false; | |
std::vector<std::string> PublicSuffix::wildcard_buf_; | |
std::vector<std::regex> PublicSuffix::whitelist_; | |
std::vector<PublicSuffix::Regex> PublicSuffix::exact_denied_regexs_; | |
std::vector<PublicSuffix::Regex> PublicSuffix::denied_regexs_; | |
std::vector<PublicSuffix::Regex> PublicSuffix::overridden_regexs_; | |
std::vector<PublicSuffix::Regex> PublicSuffix::valid_regexs_; | |
void PublicSuffix::init() | |
{ | |
if (is_inited_) return; | |
is_inited_ = true; | |
saya::managed_logger l("PublicSuffix"); | |
static const path_t PS_FILE = bridge().data_dir()/"publicsuffix"/"public_suffix_list.dat"; | |
if (!boost::filesystem::exists(PS_FILE)) { | |
throw public_suffix_error("Required file \"" + PS_FILE.string() + "\" does not exist"); | |
} | |
l.info() << "Loading file " << PS_FILE << "..." << std::endl; | |
std::ifstream ifs(PS_FILE.string(), std::ios::in); | |
std::regex const prefix_rgx(R"(^(//|\*\.|!))"); | |
std::smatch what; | |
std::regex_constants::match_flag_type const flags = std::regex_constants::match_default; | |
// Regexs for escaping dots | |
std::regex const dot_rgx(R"(\.)"); | |
// Buffer for sorting regexs | |
struct RegexBuf | |
{ | |
std::string rgx_str; | |
std::size_t origin_match_count; | |
}; | |
std::vector<RegexBuf> exact_denied_regexs_buf, denied_regexs_buf, overridden_regexs_buf, valid_regexs_buf; | |
while (!ifs.eof()) { | |
std::string line; | |
std::getline(ifs, line); | |
boost::algorithm::trim(line); | |
if (line.empty()) continue; | |
std::string::const_iterator | |
start = line.cbegin(), | |
end = line.cend(), | |
last_match_end | |
; | |
// l.info() << line << std::endl; | |
MatchType match_type = MATCH_NONE; | |
std::size_t match_length; | |
while (std::regex_search(start, end, what, prefix_rgx, flags)) { | |
if (what[0].matched) { | |
// l.warn() << "Matched " << what[0] << std::endl; | |
auto const& m = what[0].str(); | |
match_length = m.size(); | |
if (m == "//") { | |
match_type = MATCH_COMMENT; | |
} else if (m == "*.") { | |
match_type = MATCH_WILDCARD; | |
} else if (m == "!") { | |
match_type = MATCH_EXCLAMATION; | |
} | |
} | |
start = end; | |
} | |
switch (match_type) { | |
case MATCH_COMMENT: | |
continue; | |
case MATCH_WILDCARD: | |
case MATCH_EXCLAMATION: | |
line = std::string(line.begin() + match_length, line.end()); | |
break; | |
} | |
if (match_type == MATCH_WILDCARD) { | |
wildcard_buf_.push_back(line); | |
} | |
// Escape a dot in a suffix | |
line = std::regex_replace(line, dot_rgx, R"(\.)"); | |
// Add a trailing $ (\z is not supported) | |
line += R"($)"; | |
switch (match_type) { | |
case MATCH_NONE: | |
valid_regexs_buf.push_back({R"((?:[^.]+\.)+)" + line, 0}); | |
break; | |
case MATCH_WILDCARD: { | |
std::string::const_iterator | |
start = line.cbegin(), | |
end = line.cend() | |
; | |
std::smatch what; | |
std::size_t origin_match_count = 0; | |
while (std::regex_search(start, end, what, SUBDOMAIN_RGX)) { | |
++origin_match_count; | |
start = what[0].second; | |
} | |
exact_denied_regexs_buf.push_back({R"(^(?:[^.]+\.)?)" + line, origin_match_count}); | |
denied_regexs_buf.push_back({R"((?:[^.]+\.)*)" + line, origin_match_count}); | |
break; | |
} | |
case MATCH_EXCLAMATION: | |
whitelist_.push_back(std::regex(line)); | |
overridden_regexs_buf.push_back({R"((?:[^.]+\.)*)" + line, 0}); | |
break; | |
} | |
// l.warn() << "Line: " << line << std::endl; | |
} | |
l.info() << "Sorting by length..." << std::endl; | |
boost::sort(exact_denied_regexs_buf, [] (RegexBuf const& lhs, RegexBuf const& rhs) { | |
return lhs.rgx_str.size() > rhs.rgx_str.size(); | |
}); | |
boost::sort(denied_regexs_buf, [] (RegexBuf const& lhs, RegexBuf const& rhs) { | |
return lhs.rgx_str.size() > rhs.rgx_str.size(); | |
}); | |
boost::sort(overridden_regexs_buf, [] (RegexBuf const& lhs, RegexBuf const& rhs) { | |
return lhs.rgx_str.size() > rhs.rgx_str.size(); | |
}); | |
boost::sort(valid_regexs_buf, [] (RegexBuf const& lhs, RegexBuf const& rhs) { | |
return lhs.rgx_str.size() > rhs.rgx_str.size(); | |
}); | |
l.info() << "Sorted." << std::endl; | |
for (auto const& r : exact_denied_regexs_buf) { | |
exact_denied_regexs_.push_back({std::regex(r.rgx_str), r.origin_match_count}); | |
} | |
for (auto const& r : denied_regexs_buf) { | |
denied_regexs_.push_back({std::regex(r.rgx_str), r.origin_match_count}); | |
} | |
for (auto const& r : overridden_regexs_buf) { | |
overridden_regexs_.push_back({std::regex(r.rgx_str), r.origin_match_count}); | |
} | |
for (auto const& r : valid_regexs_buf) { | |
valid_regexs_.push_back({std::regex(r.rgx_str), r.origin_match_count}); | |
} | |
exact_denied_regexs_buf.clear(); | |
denied_regexs_buf.clear(); | |
overridden_regexs_buf.clear(); | |
valid_regexs_buf.clear(); | |
l.info() << "Loaded successfully!" << std::endl; | |
// { | |
// std::cmatch what; | |
// l.warn() << std::boolalpha | |
// << std::regex_match( | |
// "foo.githubcloudusercontent.com", | |
// what, | |
// std::regex(R"([^.]+\.githubcloudusercontent\.com$)") | |
// ) | |
// << std::endl; | |
// | |
// l.warn() << what[0] << std::endl; | |
// } | |
} | |
std::vector<std::string> | |
PublicSuffix::to_toplevels(std::string const& domain) | |
{ | |
if (!is_inited_) { | |
throw public_suffix_not_initialized_error("to_toplevels called before init()"); | |
} | |
if (domain == "localhost") throw unknown_domain_error(domain); | |
std::vector<std::string> toplevels; | |
bool is_denied = false; | |
for (auto const& rgx : denied_regexs_) { | |
if (std::regex_match(domain, rgx.rgx)) { | |
std::string::const_iterator | |
start = domain.cbegin(), | |
end = domain.cend() | |
; | |
std::smatch what; | |
std::size_t count = 0; | |
std::vector<std::string> toplevels_buf; | |
while (std::regex_search(start, end, what, SUBDOMAIN_RGX)) { | |
toplevels_buf.push_back(std::string(start, end)); | |
++count; | |
start = what[0].second; | |
} | |
if (count == rgx.origin_match_count) { | |
throw denied_domain_error(domain); | |
} | |
if (count > rgx.origin_match_count + 1) { | |
toplevels = std::vector<std::string>(toplevels_buf.begin(), toplevels_buf.begin() + (count - rgx.origin_match_count - 1)); | |
is_denied = false; | |
} else { | |
is_denied = true; | |
} | |
break; | |
} | |
} | |
bool is_overridden = false; | |
if (is_denied) { | |
for (auto const& orgx : overridden_regexs_) { | |
if (std::regex_match(domain, orgx.rgx)) { | |
is_denied = false; | |
std::string::const_iterator | |
start = domain.cbegin(), | |
end = domain.cend() | |
; | |
std::smatch what; | |
while (std::regex_search(start, end, what, SUBDOMAIN_RGX)) { | |
toplevels.push_back(std::string(start, end)); | |
start = what[0].second; | |
} | |
is_overridden = true; | |
BOOST_ASSERT(!toplevels.empty()); | |
break; | |
} | |
} | |
// remove the denied parent domain (again) | |
boost::remove_erase_if(toplevels, [] (std::string const& toplevel) { | |
for (auto const& rgx : exact_denied_regexs_) { | |
for (auto const& wrgx : whitelist_) { | |
if (std::regex_match(toplevel, wrgx)) return false; | |
} | |
if (std::regex_match(toplevel, rgx.rgx)) { | |
return true; | |
} | |
} | |
return false; | |
}); | |
} else { | |
// Look for remaining subdomains which is NOT denied | |
std::string::const_iterator | |
start = domain.cbegin(), | |
end = domain.cend() | |
; | |
std::smatch what; | |
while (std::regex_search(start, end, what, SUBDOMAIN_RGX)) { | |
auto const subdomain = std::string(start, end); | |
// std::regex const wrgx( | |
// R"((?:[^.]+\.)*)" + | |
// std::regex_replace(subdomain, std::regex(R"(\.)"), R"(\.)") | |
// ); | |
for (auto const& orgx : overridden_regexs_) { | |
if ( | |
!boost::algorithm::any_of_equal(wildcard_buf_, subdomain) && | |
std::regex_match(subdomain, orgx.rgx) | |
) { | |
for (auto const& vrgx : valid_regexs_) { | |
if (std::regex_match(subdomain, vrgx.rgx)) { | |
toplevels.push_back(subdomain); | |
} | |
} | |
} | |
} | |
start = what[0].second; | |
} | |
} | |
if (is_denied) { | |
throw denied_domain_error(domain); | |
} else { | |
if (!toplevels.empty() || is_overridden) { | |
boost::erase(toplevels, boost::unique<boost::return_found_end>(boost::sort(toplevels, [] (std::string const& lhs, std::string const& rhs) { | |
return lhs.size() > rhs.size(); | |
}))); | |
return toplevels; | |
} | |
} | |
for (auto const& rgx : valid_regexs_) { | |
std::smatch what; | |
if (std::regex_match(domain, what, rgx.rgx)) { | |
std::string::const_iterator | |
start = domain.cbegin(), | |
end = domain.cend() | |
; | |
std::smatch what2; | |
while (std::regex_search(start, end, what2, SUBDOMAIN_RGX)) { | |
toplevels.push_back(std::string(start, end)); | |
start = what2[0].second; | |
} | |
boost::erase(toplevels, boost::unique<boost::return_found_end>(boost::sort(toplevels, [] (std::string const& lhs, std::string const& rhs) { | |
return lhs.size() > rhs.size(); | |
}))); | |
return toplevels; | |
} | |
} | |
throw unknown_domain_error(domain); | |
} | |
} // syn |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef SYN_PUBLICSUFFIX_HPP_ | |
#define SYN_PUBLICSUFFIX_HPP_ | |
#include "syn/error.hpp" | |
#include <vector> | |
#include <regex> | |
namespace syn { | |
struct public_suffix_error : syn_error | |
{ | |
using syn_error::syn_error; | |
}; | |
struct public_suffix_not_initialized_error : public_suffix_error | |
{ | |
using public_suffix_error::public_suffix_error; | |
}; | |
struct denied_domain_error : public_suffix_error | |
{ | |
denied_domain_error(std::string const& domain) | |
: public_suffix_error("Denied domain: " + domain) | |
{} | |
}; | |
struct unknown_domain_error : public_suffix_error | |
{ | |
unknown_domain_error(std::string const& domain) | |
: public_suffix_error("Unknown domain: " + domain) | |
{} | |
}; | |
/// Static-only access point to the public-suffix rules: every member is
/// static and the constructor is private.
class PublicSuffix
{
public:
    /// Builds the rule tables; defined out of line.
    static void init();

    /// Current value of the initialisation flag.
    static bool is_inited() { return is_inited_; }

    /// Looks up `domain` in the rule tables; defined out of line.
    static std::vector<std::string> to_toplevels(std::string const& domain);

private:
    /// Kind of leading marker found on a rule line.
    enum MatchType
    {
        MATCH_NONE,
        MATCH_COMMENT,
        MATCH_WILDCARD,
        MATCH_EXCLAMATION,
    };

    /// A compiled rule together with a label count recorded for it.
    struct Regex
    {
        std::regex rgx;
        std::size_t origin_match_count;
    };

    PublicSuffix() = default;

    static void test();

    static const std::regex SUBDOMAIN_RGX;

    static bool is_inited_;
    static std::vector<std::string> wildcard_buf_;
    static std::vector<std::regex> whitelist_;
    static std::vector<Regex> exact_denied_regexs_;
    static std::vector<Regex> denied_regexs_;
    static std::vector<Regex> overridden_regexs_;
    static std::vector<Regex> valid_regexs_;
};
} // syn | |
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "syn/public_suffix.hpp" | |
#include <saya/logger.hpp> | |
#include "gtest/gtest.h" | |
// Exercises PublicSuffix::to_toplevels() before and after init(), covering
// denied, accepted and unknown names.
TEST(PublicSuffix, to_toplevels)
{
    saya::managed_logger l("PublicSuffixTest");
    l.info() << "Testing..." << std::endl;

    // Any lookup made before init() has to be rejected.
    EXPECT_THROW({
        syn::PublicSuffix::to_toplevels("");
    }, syn::public_suffix_not_initialized_error);

    syn::PublicSuffix::init();

    // The lookup must throw denied_domain_error.
    auto const expect_denied = [] (std::string const& domain) {
        EXPECT_THROW({
            syn::PublicSuffix::to_toplevels(domain);
        }, syn::denied_domain_error);
    };

    // The lookup must throw unknown_domain_error.
    auto const expect_unknown = [] (std::string const& domain) {
        EXPECT_THROW({
            syn::PublicSuffix::to_toplevels(domain);
        }, syn::unknown_domain_error);
    };

    // The lookup must not throw and must yield exactly `expected`, in order.
    // Every returned name is logged with `log_prefix` in front.
    auto const expect_toplevels = [&l] (
        char const* log_prefix,
        std::string const& domain,
        std::vector<std::string> const& expected
    ) {
        EXPECT_NO_THROW({
            auto const actual = syn::PublicSuffix::to_toplevels(domain);
            for (auto const& name : actual) {
                l.info() << log_prefix << name << std::endl;
            }
            ASSERT_EQ(actual.size(), expected.size());
            for (std::size_t i = 0; i < expected.size(); ++i) {
                EXPECT_EQ(actual[i], expected[i]);
            }
        });
    };

    // A size mismatch is a fatal failure; the HasFatalFailure() checks below
    // stop the test at that point instead of running the remaining cases.

    // denied
    expect_denied("githubcloudusercontent.com");
    expect_denied("foo.githubcloudusercontent.com");

    // valid
    expect_toplevels("t1: ", "bar.foo.githubcloudusercontent.com", {
        "bar.foo.githubcloudusercontent.com",
    });
    if (HasFatalFailure()) return;

    expect_toplevels("t2: ", "baz.bar.foo.githubcloudusercontent.com", {
        "baz.bar.foo.githubcloudusercontent.com",
        "bar.foo.githubcloudusercontent.com",
    });
    if (HasFatalFailure()) return;

    // denied
    expect_denied("kawasaki.jp");

    // valid
    expect_toplevels("t3: ", "city.kawasaki.jp", {
        "city.kawasaki.jp",
    });
    if (HasFatalFailure()) return;

    expect_toplevels("t4: ", "www.city.kawasaki.jp", {
        "www.city.kawasaki.jp",
        "city.kawasaki.jp",
    });
    if (HasFatalFailure()) return;

    expect_toplevels("t4_2: ", "www3.www2.www.city.kawasaki.jp", {
        "www3.www2.www.city.kawasaki.jp",
        "www2.www.city.kawasaki.jp",
        "www.city.kawasaki.jp",
        "city.kawasaki.jp",
    });
    if (HasFatalFailure()) return;

    expect_toplevels("t5: ", "www3.www2.www.saki7.jp", {
        "www3.www2.www.saki7.jp",
        "www2.www.saki7.jp",
        "www.saki7.jp",
        "saki7.jp",
    });
    if (HasFatalFailure()) return;

    expect_toplevels("t5_2: ", "saki7.jp", {
        "saki7.jp",
    });
    if (HasFatalFailure()) return;

    // unknown
    expect_unknown("foo.foobarbaz123321123321555551234q#?!");
    expect_unknown("localhost2");
    expect_unknown("localhost2.");
    expect_unknown("jp");
    expect_unknown(".jp");
    expect_unknown("jp.");
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment