Created
February 21, 2024 02:06
-
-
Save zhouyuan/11748111c4fe0d682083b4a13a17fa06 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm>
#include <cctype>
#include <stdexcept>
#include <string>
#include <unordered_set>
#include <vector>
/// Normalizes string values: trims surrounding spaces and maps empty values
/// or configured "null-like" values (matched case-insensitively) to a single
/// default value.
///
/// A default-constructed instance uses DEFAULT_VALUE as the replacement and
/// DEFAULT_NULL_VALUES as the set of null-like values; initialize() can
/// override both.
class UDFNormalizeString {
public:
    /// Replacement returned for empty / null-like input unless overridden.
    static const std::string DEFAULT_VALUE;
    /// Null-like values recognized unless cleared via initialize().
    static const std::unordered_set<std::string> DEFAULT_NULL_VALUES;

private:
    std::string defaultValue;
    // Invariant: every entry is lower-case, because evaluate() lower-cases
    // the candidate before looking it up.
    std::unordered_set<std::string> nullValues;

public:
    UDFNormalizeString()
        : defaultValue(DEFAULT_VALUE),
          nullValues(lowerCased(DEFAULT_NULL_VALUES)) {}

    /// Configures the normalizer from a list of arguments.
    ///
    /// @param arguments
    ///   - [0]  must be present; its content is not inspected here.
    ///   - [1]  optional replacement value (defaults to DEFAULT_VALUE).
    ///   - [2+] optional extra null-like values, stored lower-cased. An empty
    ///          string at index 2 clears the default null values; an empty
    ///          string at any later index is rejected.
    /// @throws std::invalid_argument if `arguments` is empty, or if an empty
    ///         string appears at an index greater than 2. On failure the
    ///         object keeps its previous configuration.
    void initialize(const std::vector<std::string>& arguments) {
        if (arguments.empty()) {
            throw std::invalid_argument("norm_str() expects at least one argument.");
        }

        // Build the new configuration in locals so that a validation failure
        // part-way through does not leave the object half-updated.
        std::string newDefault = arguments.size() >= 2 ? arguments[1] : DEFAULT_VALUE;
        std::unordered_set<std::string> newNullValues = lowerCased(DEFAULT_NULL_VALUES);

        for (size_t i = 2; i < arguments.size(); ++i) {
            if (!arguments[i].empty()) {
                newNullValues.insert(toLower(arguments[i]));
                continue;
            }
            if (i != 2) {
                throw std::invalid_argument("Only the third null argument will clear the default null values of norm_str().");
            }
            newNullValues.clear();
        }

        defaultValue = std::move(newDefault);
        nullValues = std::move(newNullValues);
    }

    /// Normalizes a single value.
    ///
    /// @param input raw value.
    /// @return the configured default if `input` is empty, consists only of
    ///         spaces, or (after trimming, ignoring case) is one of the
    ///         null-like values; otherwise `input` with leading and trailing
    ///         spaces removed.
    std::string evaluate(const std::string& input) const {
        std::string trimmedInput = trim(input);
        if (trimmedInput.empty()) {
            return defaultValue;
        }
        if (nullValues.find(toLower(trimmedInput)) != nullValues.end()) {
            return defaultValue;
        }
        return trimmedInput;
    }

private:
    /// Returns a copy of `values` with every entry lower-cased. Needed because
    /// DEFAULT_NULL_VALUES contains DEFAULT_VALUE in its original case
    /// ("N-A"), which a lower-cased lookup key would otherwise never match.
    static std::unordered_set<std::string> lowerCased(const std::unordered_set<std::string>& values) {
        std::unordered_set<std::string> result;
        for (const std::string& value : values) {
            result.insert(toLower(value));
        }
        return result;
    }

    /// Returns a lower-cased copy of `str` using std::tolower per byte.
    static std::string toLower(const std::string& str) {
        std::string lowerStr = str;
        // Going through unsigned char keeps std::tolower well-defined for
        // bytes outside the 7-bit range.
        std::transform(lowerStr.begin(), lowerStr.end(), lowerStr.begin(),
                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
        return lowerStr;
    }

    /// Returns `str` without leading/trailing space characters. Only ' ' is
    /// stripped; tabs and newlines are left in place.
    static std::string trim(const std::string& str) {
        const size_t first = str.find_first_not_of(' ');
        if (first == std::string::npos) {
            return "";
        }
        const size_t last = str.find_last_not_of(' ');
        return str.substr(first, last - first + 1);
    }
};

const std::string UDFNormalizeString::DEFAULT_VALUE = "N-A";
const std::unordered_set<std::string> UDFNormalizeString::DEFAULT_NULL_VALUES = {"null", "unknown", "unknow", UDFNormalizeString::DEFAULT_VALUE};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment