Created
August 10, 2017 12:57
-
-
Save rcurtin/0628ebb53f349ce60a6dc6099902005d to your computer and use it in GitHub Desktop.
DatasetMapper patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 6d16964d083a1af092f3ea25830487664c1c9cb3 Mon Sep 17 00:00:00 2001 | |
From: Ryan Curtin <ryan@ratml.org> | |
Date: Thu, 10 Aug 2017 08:55:12 -0400 | |
Subject: [PATCH] Refactor DatasetMapper to map any input type. | |
--- | |
src/mlpack/core/data/dataset_mapper.hpp | 51 ++++++------ | |
src/mlpack/core/data/dataset_mapper_impl.hpp | 93 +++++++++++----------- | |
.../core/data/map_policies/increment_policy.hpp | 63 +++++++++------ | |
src/mlpack/core/util/param.hpp | 4 +- | |
src/mlpack/core/util/param_data.hpp | 4 +- | |
src/mlpack/tests/imputation_test.cpp | 49 ++++++++++++ | |
6 files changed, 166 insertions(+), 98 deletions(-) | |
diff --git a/src/mlpack/core/data/dataset_mapper.hpp b/src/mlpack/core/data/dataset_mapper.hpp | |
index 777fa3b15..91f1b1d5e 100644 | |
--- a/src/mlpack/core/data/dataset_mapper.hpp | |
+++ b/src/mlpack/core/data/dataset_mapper.hpp | |
@@ -23,16 +23,22 @@ | |
namespace mlpack { | |
namespace data { | |
+ | |
/** | |
- * Auxiliary information for a dataset, including mappings to/from strings and | |
- * the datatype of each dimension. DatasetMapper objects are optionally | |
- * produced by data::Load(), and store the type of each dimension | |
+ * Auxiliary information for a dataset, including mappings to/from strings (or | |
+ * other types) and the datatype of each dimension. DatasetMapper objects are | |
+ * optionally produced by data::Load(), and store the type of each dimension | |
* (Datatype::numeric or Datatype::categorical) as well as mappings from strings | |
* to unsigned integers and vice versa. | |
* | |
- * @tparam PolicyType Mapping policy used to specify MapString(); | |
+ * DatasetMapper objects can also map from arbitrary types; the type to map from | |
+ * can be specified with the InputType template parameter. By default, the | |
+ * InputType parameter is std::string. | |
+ * | |
+ * @tparam PolicyType Mapping policy used to specify MapString(). | |
+ * @tparam InputType Type of input to be mapped. | |
*/ | |
-template <typename PolicyType> | |
+template<typename PolicyType, typename InputType = std::string> | |
class DatasetMapper | |
{ | |
public: | |
@@ -51,50 +57,49 @@ class DatasetMapper | |
explicit DatasetMapper(PolicyType& policy, const size_t dimensionality = 0); | |
/** | |
- * Preprocessing: during a first pass of the data, pass the strings on to the | |
+ * Preprocessing: during a first pass of the data, pass the input on to the | |
* MapPolicy if they are needed. | |
* | |
- * @param string String to map. | |
+ * @param input Input to map. | |
* @param dimension Dimension to map for. | |
*/ | |
template<typename T> | |
- void MapFirstPass(const std::string& string, const size_t dimension); | |
+ void MapFirstPass(const InputType& input, const size_t dimension); | |
/** | |
- * Given the string and the dimension to which it belongs, return its numeric | |
- * mapping. If no mapping yet exists, the string is added to the list of | |
+ * Given the input and the dimension to which it belongs, return its numeric | |
+ * mapping. If no mapping yet exists, the input is added to the list of | |
* mappings for the given dimension. The dimension parameter refers to the | |
* index of the dimension of the string (i.e. the row in the dataset). | |
* | |
* @tparam T Numeric type to map to (int/double/float/etc.). | |
- * @param string String to find/create mapping for. | |
+ * @param input Input to find/create mapping for. | |
* @param dimension Index of the dimension of the string. | |
*/ | |
template<typename T> | |
- T MapString(const std::string& string, | |
+ T MapString(const InputType& input, | |
const size_t dimension); | |
/** | |
- * Return the string that corresponds to a given value in a given dimension. | |
- * If the string is not a valid mapping in the given dimension, a | |
+ * Return the input that corresponds to a given value in a given dimension. | |
+ * If the value is not a valid mapping in the given dimension, a | |
* std::invalid_argument is thrown. | |
* | |
- * @param value Mapped value for string. | |
+ * @param value Mapped value for input. | |
* @param dimension Dimension to unmap string from. | |
*/ | |
- const std::string& UnmapString(const size_t value, const size_t dimension); | |
- | |
+ const InputType& UnmapString(const size_t value, const size_t dimension); | |
/** | |
- * Return the value that corresponds to a given string in a given dimension. | |
+ * Return the value that corresponds to a given input in a given dimension. | |
* If the value is not a valid mapping in the given dimension, a | |
* std::invalid_argument is thrown. | |
* | |
- * @param string Mapped string for value. | |
- * @param dimension Dimension to unmap string from. | |
+ * @param input Mapped input for value. | |
+ * @param dimension Dimension to unmap input from. | |
*/ | |
- typename PolicyType::MappedType UnmapValue(const std::string& string, | |
- const size_t dimension); | |
+ typename PolicyType::MappedType UnmapValue(const InputType& input, | |
+ const size_t dimension); | |
//! Return the type of a given dimension (numeric or categorical). | |
Datatype Type(const size_t dimension) const; | |
@@ -138,7 +143,7 @@ class DatasetMapper | |
std::vector<Datatype> types; | |
// BiMapType definition | |
- using BiMapType = boost::bimap<std::string, typename PolicyType::MappedType>; | |
+ using BiMapType = boost::bimap<InputType, typename PolicyType::MappedType>; | |
// Mappings from strings to integers. | |
// Map entries will only exist for dimensions that are categorical. | |
diff --git a/src/mlpack/core/data/dataset_mapper_impl.hpp b/src/mlpack/core/data/dataset_mapper_impl.hpp | |
index 48604e04d..821cbe2af 100644 | |
--- a/src/mlpack/core/data/dataset_mapper_impl.hpp | |
+++ b/src/mlpack/core/data/dataset_mapper_impl.hpp | |
@@ -20,15 +20,16 @@ namespace mlpack { | |
namespace data { | |
// Default constructor. | |
-template<typename PolicyType> | |
-inline DatasetMapper<PolicyType>::DatasetMapper(const size_t dimensionality) : | |
+template<typename PolicyType, typename InputType> | |
+inline DatasetMapper<PolicyType, InputType>::DatasetMapper( | |
+ const size_t dimensionality) : | |
types(dimensionality, Datatype::numeric) | |
{ | |
// Nothing to initialize here. | |
} | |
-template<typename PolicyType> | |
-inline DatasetMapper<PolicyType>::DatasetMapper(PolicyType& policy, | |
+template<typename PolicyType, typename InputType> | |
+inline DatasetMapper<PolicyType, InputType>::DatasetMapper(PolicyType& policy, | |
const size_t dimensionality) : | |
types(dimensionality, Datatype::numeric), | |
policy(std::move(policy)) | |
@@ -37,22 +38,22 @@ inline DatasetMapper<PolicyType>::DatasetMapper(PolicyType& policy, | |
} | |
// Utility helper function to call MapFirstPass. | |
-template<typename PolicyType, typename T> | |
+template<typename PolicyType, typename InputType, typename T> | |
void CallMapFirstPass( | |
PolicyType& policy, | |
- const std::string& string, | |
+ const InputType& input, | |
const size_t dimension, | |
std::vector<Datatype>& types, | |
const typename std::enable_if<PolicyType::NeedsFirstPass>::type* = 0) | |
{ | |
- policy.template MapFirstPass<T>(string, dimension, types); | |
+ policy.template MapFirstPass<T>(input, dimension, types); | |
} | |
// Utility helper function that doesn't call anything. | |
-template<typename PolicyType, typename T> | |
+template<typename PolicyType, typename InputType, typename T> | |
void CallMapFirstPass( | |
PolicyType& /* policy */, | |
- const std::string& /* string */, | |
+ const InputType& /* input */, | |
const size_t /* dimension */, | |
std::vector<Datatype>& /* types */, | |
const typename std::enable_if<!PolicyType::NeedsFirstPass>::type* = 0) | |
@@ -60,28 +61,29 @@ void CallMapFirstPass( | |
// Nothing to do here. | |
} | |
-template<typename PolicyType> | |
+template<typename PolicyType, typename InputType> | |
template<typename T> | |
-void DatasetMapper<PolicyType>::MapFirstPass(const std::string& string, | |
- const size_t dimension) | |
+void DatasetMapper<PolicyType, InputType>::MapFirstPass(const InputType& input, | |
+ const size_t dimension) | |
{ | |
// Call the correct overload (via SFINAE). | |
- CallMapFirstPass<PolicyType, T>(policy, string, dimension, types); | |
+ CallMapFirstPass<PolicyType, InputType, T>(policy, input, dimension, types); | |
} | |
// When we want to insert value into the map, we use the policy to map the | |
-// string. | |
-template<typename PolicyType> | |
+// input. | |
+template<typename PolicyType, typename InputType> | |
template<typename T> | |
-inline T DatasetMapper<PolicyType>::MapString(const std::string& string, | |
- const size_t dimension) | |
+inline T DatasetMapper<PolicyType, InputType>::MapString( | |
+ const InputType& input, | |
+ const size_t dimension) | |
{ | |
- return policy.template MapString<MapType, T>(string, dimension, maps, types); | |
+ return policy.template MapString<MapType, T>(input, dimension, maps, types); | |
} | |
-// Return the string corresponding to a value in a given dimension. | |
-template<typename PolicyType> | |
-inline const std::string& DatasetMapper<PolicyType>::UnmapString( | |
+// Return the input corresponding to a value in a given dimension. | |
+template<typename PolicyType, typename InputType> | |
+inline const InputType& DatasetMapper<PolicyType, InputType>::UnmapString( | |
const size_t value, | |
const size_t dimension) | |
{ | |
@@ -97,27 +99,29 @@ inline const std::string& DatasetMapper<PolicyType>::UnmapString( | |
return maps[dimension].first.right.at(value); | |
} | |
-// Return the value corresponding to a string in a given dimension. | |
-template<typename PolicyType> | |
-inline typename PolicyType::MappedType DatasetMapper<PolicyType>::UnmapValue( | |
- const std::string& string, | |
+// Return the value corresponding to an input in a given dimension. | |
+template<typename PolicyType, typename InputType> | |
+inline typename PolicyType::MappedType | |
+DatasetMapper<PolicyType, InputType>::UnmapValue( | |
+ const InputType& input, | |
const size_t dimension) | |
{ | |
// Throw an exception if the value doesn't exist. | |
- if (maps[dimension].first.left.count(string) == 0) | |
+ if (maps[dimension].first.left.count(input) == 0) | |
{ | |
std::ostringstream oss; | |
- oss << "DatasetMapper<PolicyType>::UnmapValue(): string '" << string | |
+ oss << "DatasetMapper<PolicyType>::UnmapValue(): input '" << input | |
<< "' unknown for dimension " << dimension; | |
throw std::invalid_argument(oss.str()); | |
} | |
- return maps[dimension].first.left.at(string); | |
+ return maps[dimension].first.left.at(input); | |
} | |
// Get the type of a particular dimension. | |
-template<typename PolicyType> | |
-inline Datatype DatasetMapper<PolicyType>::Type(const size_t dimension) const | |
+template<typename PolicyType, typename InputType> | |
+inline Datatype DatasetMapper<PolicyType, InputType>::Type( | |
+ const size_t dimension) const | |
{ | |
if (dimension >= types.size()) | |
{ | |
@@ -130,8 +134,9 @@ inline Datatype DatasetMapper<PolicyType>::Type(const size_t dimension) const | |
return types[dimension]; | |
} | |
-template<typename PolicyType> | |
-inline Datatype& DatasetMapper<PolicyType>::Type(const size_t dimension) | |
+template<typename PolicyType, typename InputType> | |
+inline Datatype& DatasetMapper<PolicyType, InputType>::Type( | |
+ const size_t dimension) | |
{ | |
if (dimension >= types.size()) | |
types.resize(dimension + 1, Datatype::numeric); | |
@@ -139,39 +144,37 @@ inline Datatype& DatasetMapper<PolicyType>::Type(const size_t dimension) | |
return types[dimension]; | |
} | |
-template<typename PolicyType> | |
-inline | |
-size_t DatasetMapper<PolicyType>::NumMappings(const size_t dimension) const | |
+template<typename PolicyType, typename InputType> | |
+inline size_t | |
+DatasetMapper<PolicyType, InputType>::NumMappings(const size_t dimension) const | |
{ | |
return (maps.count(dimension) == 0) ? 0 : maps.at(dimension).second; | |
} | |
-template<typename PolicyType> | |
-inline size_t DatasetMapper<PolicyType>::Dimensionality() const | |
+template<typename PolicyType, typename InputType> | |
+inline size_t DatasetMapper<PolicyType, InputType>::Dimensionality() const | |
{ | |
return types.size(); | |
} | |
-template<typename PolicyType> | |
-inline const PolicyType& DatasetMapper<PolicyType>::Policy() const | |
+template<typename PolicyType, typename InputType> | |
+inline const PolicyType& DatasetMapper<PolicyType, InputType>::Policy() const | |
{ | |
return this->policy; | |
} | |
-template<typename PolicyType> | |
-inline PolicyType& DatasetMapper<PolicyType>::Policy() | |
+template<typename PolicyType, typename InputType> | |
+inline PolicyType& DatasetMapper<PolicyType, InputType>::Policy() | |
{ | |
return this->policy; | |
} | |
-template<typename PolicyType> | |
-inline void DatasetMapper<PolicyType>::Policy(PolicyType&& policy) | |
+template<typename PolicyType, typename InputType> | |
+inline void DatasetMapper<PolicyType, InputType>::Policy(PolicyType&& policy) | |
{ | |
this->policy = std::forward<PolicyType>(policy); | |
} | |
- | |
- | |
} // namespace data | |
} // namespace mlpack | |
diff --git a/src/mlpack/core/data/map_policies/increment_policy.hpp b/src/mlpack/core/data/map_policies/increment_policy.hpp | |
index 4b7f50b26..ec4bda136 100644 | |
--- a/src/mlpack/core/data/map_policies/increment_policy.hpp | |
+++ b/src/mlpack/core/data/map_policies/increment_policy.hpp | |
@@ -19,16 +19,24 @@ | |
namespace mlpack { | |
namespace data { | |
+ | |
/** | |
* IncrementPolicy is used as a helper class for DatasetMapper. It tells how the | |
* strings should be mapped. Purpose of this policy is to map all dimension if | |
* one if the variables in a dimension turns out to be a categorical variable. | |
* IncrementPolicy maps strings to incrementing unsigned integers (size_t). | |
- * The first string to be mapped will be mapped to 0, the next to 1 and so on. | |
+ * The first input to be mapped will be mapped to 0, the next to 1 and so on. | |
+ * | |
+ * If the 'forceAllMappings' parameter is set to true, every input is mapped. | |
+ * Otherwise, inputs in a numeric dimension are only mapped if they cannot be | |
+ * converted to the output type via a stringstream extraction. | |
*/ | |
class IncrementPolicy | |
{ | |
public: | |
+ IncrementPolicy(const bool forceAllMappings = false) : | |
+ forceAllMappings(forceAllMappings) { } | |
+ | |
// typedef of MappedType | |
using MappedType = size_t; | |
@@ -38,8 +46,8 @@ class IncrementPolicy | |
/** | |
* Determine if the dimension is numeric or categorical. | |
*/ | |
- template<typename T> | |
- void MapFirstPass(const std::string& string, | |
+ template<typename T, typename InputType> | |
+ void MapFirstPass(const InputType& input, | |
const size_t dim, | |
std::vector<Datatype>& types) | |
{ | |
@@ -49,19 +57,21 @@ class IncrementPolicy | |
return; | |
} | |
- // Otherwise we need to attempt to read the value. If the read fails, the | |
- // dimension is categorical; otherwise we leave it at the default of | |
- // numeric. | |
- std::stringstream token; | |
- token.str(string); | |
- T val; | |
- token >> val; | |
- | |
- if (token.fail() || !token.eof()) | |
+ if (forceAllMappings) | |
{ | |
- // Parsing failed; the dimension is categorical. | |
types[dim] = Datatype::categorical; | |
} | |
+ else | |
+ { | |
+ // Attempt to convert the input to the output type T via a stringstream. | |
+ std::stringstream token; | |
+ token << input; | |
+ T val; | |
+ token >> val; | |
+ | |
+ if (token.fail() || !token.eof()) | |
+ types[dim] = Datatype::categorical; | |
+ } | |
} | |
/** | |
@@ -77,37 +87,34 @@ class IncrementPolicy | |
* @param maps Unordered map given by the DatasetMapper. | |
* @param types Vector containing the type information about each dimensions. | |
*/ | |
- template<typename MapType, typename T> | |
- T MapString(const std::string& string, | |
+ template<typename MapType, typename T, typename InputType> | |
+ T MapString(const InputType& input, | |
const size_t dimension, | |
MapType& maps, | |
std::vector<Datatype>& types) | |
{ | |
// If we are in a categorical dimension we already know we need to map. | |
- if (types[dimension] == Datatype::numeric) | |
+ if (types[dimension] == Datatype::numeric && !forceAllMappings) | |
{ | |
// Check if this string needs to be mapped or if it can be read | |
// directly as a number. This will be true if nothing else in this | |
// dimension has yet been mapped, but this can't be read as a number. | |
std::stringstream token; | |
- token.str(string); | |
+ token << input; | |
T val; | |
token >> val; | |
if (!token.fail() && token.eof()) | |
- { | |
- // We can return what we have. | |
return val; | |
- } | |
- } | |
- // The token must be mapped. | |
+ // Otherwise, we must map. | |
+ } | |
// If this condition is true, either we have no mapping for the given string | |
// or we have no mappings for the given dimension at all. In either case, | |
// we create a mapping. | |
if (maps.count(dimension) == 0 || | |
- maps[dimension].first.left.count(string) == 0) | |
+ maps[dimension].first.left.count(input) == 0) | |
{ | |
// This string does not exist yet. | |
size_t& numMappings = maps[dimension].second; | |
@@ -116,16 +123,20 @@ class IncrementPolicy | |
if (numMappings == 0) | |
types[dimension] = Datatype::categorical; | |
- typedef boost::bimap<std::string, MappedType>::value_type PairType; | |
- maps[dimension].first.insert(PairType(string, numMappings)); | |
+ typedef typename boost::bimap<InputType, MappedType>::value_type PairType; | |
+ maps[dimension].first.insert(PairType(input, numMappings)); | |
return T(numMappings++); | |
} | |
else | |
{ | |
// This string already exists in the mapping. | |
- return maps[dimension].first.left.at(string); | |
+ return maps[dimension].first.left.at(input); | |
} | |
} | |
+ | |
+ private: | |
+ // Whether or not we should map all tokens. | |
+ bool forceAllMappings; | |
}; // class IncrementPolicy | |
} // namespace data | |
diff --git a/src/mlpack/core/util/param.hpp b/src/mlpack/core/util/param.hpp | |
index 0d08cb69f..c673a1daf 100644 | |
--- a/src/mlpack/core/util/param.hpp | |
+++ b/src/mlpack/core/util/param.hpp | |
@@ -21,10 +21,10 @@ namespace data { | |
class IncrementPolicy; | |
-template<typename PolicyType> | |
+template<typename PolicyType, typename InputType> | |
class DatasetMapper; | |
-using DatasetInfo = DatasetMapper<IncrementPolicy>; | |
+using DatasetInfo = DatasetMapper<IncrementPolicy, std::string>; | |
} // namespace data | |
} // namespace mlpack | |
diff --git a/src/mlpack/core/util/param_data.hpp b/src/mlpack/core/util/param_data.hpp | |
index 511f59e66..ef7b08b6e 100644 | |
--- a/src/mlpack/core/util/param_data.hpp | |
+++ b/src/mlpack/core/util/param_data.hpp | |
@@ -91,8 +91,8 @@ struct ParameterType<arma::Row<eT>> | |
* For matrix+dataset info types, we should accept a std::string. | |
*/ | |
template<typename eT, typename PolicyType> | |
-struct ParameterType<std::tuple<mlpack::data::DatasetMapper<PolicyType>, | |
- arma::Mat<eT>>> | |
+struct ParameterType<std::tuple<mlpack::data::DatasetMapper<PolicyType, | |
+ std::string>, arma::Mat<eT>>> | |
{ | |
typedef std::string type; | |
}; | |
diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp | |
index 9fcd26a2d..f1ffd82cb 100644 | |
--- a/src/mlpack/tests/imputation_test.cpp | |
+++ b/src/mlpack/tests/imputation_test.cpp | |
@@ -16,6 +16,7 @@ | |
#include <mlpack/core/data/dataset_mapper.hpp> | |
#include <mlpack/core/data/map_policies/increment_policy.hpp> | |
#include <mlpack/core/data/map_policies/missing_policy.hpp> | |
+#include <mlpack/core/data/map_policies/map_all.hpp> | |
#include <mlpack/core/data/imputer.hpp> | |
#include <mlpack/core/data/imputation_methods/custom_imputation.hpp> | |
#include <mlpack/core/data/imputation_methods/listwise_deletion.hpp> | |
@@ -266,4 +267,52 @@ BOOST_AUTO_TEST_CASE(ListwiseDeletionTest) | |
BOOST_REQUIRE_CLOSE(rowWiseInput(1, 3), 8.0, 1e-5); | |
} | |
+/** | |
+ * Make sure we can map non-strings. | |
+ */ | |
+BOOST_AUTO_TEST_CASE(DatasetMapperNonStringMapping) | |
+{ | |
+ IncrementPolicy incr(true); | |
+ DatasetMapper<IncrementPolicy, double> dm(incr, 1); | |
+ dm.MapString<size_t>(5.0, 0); | |
+ dm.MapString<size_t>(4.3, 0); | |
+ dm.MapString<size_t>(1.1, 0); | |
+ | |
+ BOOST_REQUIRE_EQUAL(dm.NumMappings(0), 3); | |
+ | |
+ BOOST_REQUIRE_EQUAL(dm.Type(0), data::Datatype::categorical); | |
+ | |
+ BOOST_REQUIRE_EQUAL(dm.UnmapValue(5.0, 0), 0); | |
+ BOOST_REQUIRE_EQUAL(dm.UnmapValue(4.3, 0), 1); | |
+ BOOST_REQUIRE_EQUAL(dm.UnmapValue(1.1, 0), 2); | |
+ | |
+ BOOST_REQUIRE_EQUAL(dm.UnmapString(0, 0), 5.0); | |
+ BOOST_REQUIRE_EQUAL(dm.UnmapString(1, 0), 4.3); | |
+ BOOST_REQUIRE_EQUAL(dm.UnmapString(2, 0), 1.1); | |
+} | |
+ | |
+/** | |
+ * Make sure we can map strange types. | |
+ */ | |
+BOOST_AUTO_TEST_CASE(DatasetMapperPointerMapping) | |
+{ | |
+ int a = 1, b = 2, c = 3; | |
+ IncrementPolicy incr(true); | |
+ DatasetMapper<IncrementPolicy, int*> dm(incr, 1); | |
+ | |
+ dm.MapString<size_t>(&a, 0); | |
+ dm.MapString<size_t>(&b, 0); | |
+ dm.MapString<size_t>(&c, 0); | |
+ | |
+ BOOST_REQUIRE_EQUAL(dm.NumMappings(0), 3); | |
+ | |
+ BOOST_REQUIRE_EQUAL(dm.UnmapValue(&a, 0), 0); | |
+ BOOST_REQUIRE_EQUAL(dm.UnmapValue(&b, 0), 1); | |
+ BOOST_REQUIRE_EQUAL(dm.UnmapValue(&c, 0), 2); | |
+ | |
+ BOOST_REQUIRE_EQUAL(dm.UnmapString(0, 0), &a); | |
+ BOOST_REQUIRE_EQUAL(dm.UnmapString(1, 0), &b); | |
+ BOOST_REQUIRE_EQUAL(dm.UnmapString(2, 0), &c); | |
+} | |
+ | |
BOOST_AUTO_TEST_SUITE_END(); | |
-- | |
2.11.0 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment