Skip to content

Instantly share code, notes, and snippets.

@zhouyuan
Created February 21, 2024 02:06
Show Gist options
  • Save zhouyuan/11748111c4fe0d682083b4a13a17fa06 to your computer and use it in GitHub Desktop.
Save zhouyuan/11748111c4fe0d682083b4a13a17fa06 to your computer and use it in GitHub Desktop.
#include <algorithm>
#include <cctype>
#include <cstddef>
#include <stdexcept>
#include <string>
#include <unordered_set>
#include <vector>
/// String normalizer behind the `norm_str()` function.
///
/// evaluate() strips leading/trailing space characters from its input and
/// substitutes a configurable default value when the result is empty or
/// matches (case-insensitively) one of the configured "null-like" values.
class UDFNormalizeString {
public:
    /// Replacement value used when none is supplied to initialize().
    static const std::string DEFAULT_VALUE;
    /// Built-in null-like values. They are lower-cased before being copied
    /// into an instance, so matching against them is case-insensitive.
    static const std::unordered_set<std::string> DEFAULT_NULL_VALUES;

    /// Creates a normalizer with the default replacement and default null values.
    UDFNormalizeString() : defaultValue(DEFAULT_VALUE), nullValues(lowerCasedDefaults()) {}

    /// Configures the normalizer from the function-call arguments.
    ///
    /// @param arguments
    ///   - [0]   required, but its content is not inspected here.
    ///   - [1]   optional replacement value (falls back to DEFAULT_VALUE).
    ///   - [2..] optional extra null-like values, matched case-insensitively.
    ///           An empty string at index 2 discards the built-in null
    ///           values; an empty string at any later index is an error.
    /// @throws std::invalid_argument if `arguments` is empty, or an empty
    ///         string appears after index 2. The previous configuration is
    ///         left untouched when this function throws.
    void initialize(const std::vector<std::string>& arguments) {
        if (arguments.empty()) {
            throw std::invalid_argument("norm_str() expects at least one argument.");
        }

        // Stage the new configuration in locals and commit only at the end,
        // so a validation failure cannot leave the object half-configured.
        std::string stagedDefault = arguments.size() >= 2 ? arguments[1] : DEFAULT_VALUE;
        std::unordered_set<std::string> stagedNulls = lowerCasedDefaults();

        for (std::size_t i = 2; i < arguments.size(); ++i) {
            if (!arguments[i].empty()) {
                stagedNulls.insert(toLower(arguments[i]));
                continue;
            }
            if (i != 2) {
                throw std::invalid_argument("Only the third null argument will clear the default null values of norm_str().");
            }
            // Empty third argument: drop the built-in null values.
            stagedNulls.clear();
        }

        defaultValue.swap(stagedDefault);
        nullValues.swap(stagedNulls);
    }

    /// Normalizes one value.
    ///
    /// @param input raw string.
    /// @return the configured default when `input` is empty, consists only of
    ///         spaces, or (after trimming and lower-casing) is one of the
    ///         configured null values; otherwise `input` with surrounding
    ///         spaces removed (original letter case preserved).
    std::string evaluate(const std::string& input) const {
        std::string trimmed = trim(input);
        if (trimmed.empty() || nullValues.count(toLower(trimmed)) != 0) {
            return defaultValue;
        }
        return trimmed;
    }

private:
    std::string defaultValue;
    /// Always holds lower-cased entries; probes are lower-cased before lookup.
    std::unordered_set<std::string> nullValues;

    /// Returns DEFAULT_NULL_VALUES with every entry lower-cased.
    /// Needed because DEFAULT_VALUE ("N-A") is stored there in mixed case,
    /// while lookups always use a lower-cased key.
    static std::unordered_set<std::string> lowerCasedDefaults() {
        std::unordered_set<std::string> lowered;
        for (const std::string& value : DEFAULT_NULL_VALUES) {
            lowered.insert(toLower(value));
        }
        return lowered;
    }

    /// Returns a copy of `str` with each byte passed through std::tolower
    /// (behaviour for non-ASCII bytes therefore depends on the C locale).
    static std::string toLower(const std::string& str) {
        std::string lowerStr = str;
        std::transform(lowerStr.begin(), lowerStr.end(), lowerStr.begin(),
                       // unsigned char: std::tolower is undefined for negative values.
                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
        return lowerStr;
    }

    /// Returns `str` without leading/trailing ' ' characters.
    /// Only the space character is stripped; tabs and newlines are kept.
    static std::string trim(const std::string& str) {
        const std::string::size_type first = str.find_first_not_of(' ');
        if (first == std::string::npos) {
            return std::string();
        }
        const std::string::size_type last = str.find_last_not_of(' ');
        return str.substr(first, last - first + 1);
    }
};

const std::string UDFNormalizeString::DEFAULT_VALUE = "N-A";
// Must stay after DEFAULT_VALUE: it is read during this object's initialization.
const std::unordered_set<std::string> UDFNormalizeString::DEFAULT_NULL_VALUES = {"null", "unknown", "unknow", UDFNormalizeString::DEFAULT_VALUE};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment