Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rcurtin/0628ebb53f349ce60a6dc6099902005d to your computer and use it in GitHub Desktop.
Save rcurtin/0628ebb53f349ce60a6dc6099902005d to your computer and use it in GitHub Desktop.
DatasetMapper patch
From 6d16964d083a1af092f3ea25830487664c1c9cb3 Mon Sep 17 00:00:00 2001
From: Ryan Curtin <ryan@ratml.org>
Date: Thu, 10 Aug 2017 08:55:12 -0400
Subject: [PATCH] Refactor DatasetMapper to map any input type.
---
src/mlpack/core/data/dataset_mapper.hpp | 51 ++++++------
src/mlpack/core/data/dataset_mapper_impl.hpp | 93 +++++++++++-----------
.../core/data/map_policies/increment_policy.hpp | 63 +++++++++------
src/mlpack/core/util/param.hpp | 4 +-
src/mlpack/core/util/param_data.hpp | 4 +-
src/mlpack/tests/imputation_test.cpp | 49 ++++++++++++
6 files changed, 166 insertions(+), 98 deletions(-)
diff --git a/src/mlpack/core/data/dataset_mapper.hpp b/src/mlpack/core/data/dataset_mapper.hpp
index 777fa3b15..91f1b1d5e 100644
--- a/src/mlpack/core/data/dataset_mapper.hpp
+++ b/src/mlpack/core/data/dataset_mapper.hpp
@@ -23,16 +23,22 @@
namespace mlpack {
namespace data {
+
/**
- * Auxiliary information for a dataset, including mappings to/from strings and
- * the datatype of each dimension. DatasetMapper objects are optionally
- * produced by data::Load(), and store the type of each dimension
+ * Auxiliary information for a dataset, including mappings to/from strings (or
+ * other types) and the datatype of each dimension. DatasetMapper objects are
+ * optionally produced by data::Load(), and store the type of each dimension
* (Datatype::numeric or Datatype::categorical) as well as mappings from strings
* to unsigned integers and vice versa.
*
- * @tparam PolicyType Mapping policy used to specify MapString();
+ * DatasetMapper objects can also map from arbitrary types; the type to map from
+ * can be specified with the InputType template parameter. By default, the
+ * InputType parameter is std::string.
+ *
+ * @tparam PolicyType Mapping policy used to specify MapString().
+ * @tparam InputType Type of input to be mapped.
*/
-template <typename PolicyType>
+template<typename PolicyType, typename InputType = std::string>
class DatasetMapper
{
public:
@@ -51,50 +57,49 @@ class DatasetMapper
explicit DatasetMapper(PolicyType& policy, const size_t dimensionality = 0);
/**
- * Preprocessing: during a first pass of the data, pass the strings on to the
+ * Preprocessing: during a first pass of the data, pass the input on to the
* MapPolicy if they are needed.
*
- * @param string String to map.
+ * @param input Input to map.
* @param dimension Dimension to map for.
*/
template<typename T>
- void MapFirstPass(const std::string& string, const size_t dimension);
+ void MapFirstPass(const InputType& input, const size_t dimension);
/**
- * Given the string and the dimension to which it belongs, return its numeric
- * mapping. If no mapping yet exists, the string is added to the list of
+ * Given the input and the dimension to which it belongs, return its numeric
+ * mapping. If no mapping yet exists, the input is added to the list of
* mappings for the given dimension. The dimension parameter refers to the
* index of the dimension of the string (i.e. the row in the dataset).
*
* @tparam T Numeric type to map to (int/double/float/etc.).
- * @param string String to find/create mapping for.
+ * @param input Input to find/create mapping for.
* @param dimension Index of the dimension of the string.
*/
template<typename T>
- T MapString(const std::string& string,
+ T MapString(const InputType& input,
const size_t dimension);
/**
- * Return the string that corresponds to a given value in a given dimension.
- * If the string is not a valid mapping in the given dimension, a
+ * Return the input that corresponds to a given value in a given dimension.
+ * If the value is not a valid mapping in the given dimension, a
* std::invalid_argument is thrown.
*
- * @param value Mapped value for string.
+ * @param value Mapped value whose original input should be returned.
* @param dimension Dimension to unmap string from.
*/
- const std::string& UnmapString(const size_t value, const size_t dimension);
-
+ const InputType& UnmapString(const size_t value, const size_t dimension);
/**
- * Return the value that corresponds to a given string in a given dimension.
+ * Return the value that corresponds to a given input in a given dimension.
* If the value is not a valid mapping in the given dimension, a
* std::invalid_argument is thrown.
*
- * @param string Mapped string for value.
- * @param dimension Dimension to unmap string from.
+ * @param input Input whose mapped value should be returned.
+ * @param dimension Dimension to unmap input from.
*/
- typename PolicyType::MappedType UnmapValue(const std::string& string,
- const size_t dimension);
+ typename PolicyType::MappedType UnmapValue(const InputType& input,
+ const size_t dimension);
//! Return the type of a given dimension (numeric or categorical).
Datatype Type(const size_t dimension) const;
@@ -138,7 +143,7 @@ class DatasetMapper
std::vector<Datatype> types;
// BiMapType definition
- using BiMapType = boost::bimap<std::string, typename PolicyType::MappedType>;
+ using BiMapType = boost::bimap<InputType, typename PolicyType::MappedType>;
// Mappings from strings to integers.
// Map entries will only exist for dimensions that are categorical.
diff --git a/src/mlpack/core/data/dataset_mapper_impl.hpp b/src/mlpack/core/data/dataset_mapper_impl.hpp
index 48604e04d..821cbe2af 100644
--- a/src/mlpack/core/data/dataset_mapper_impl.hpp
+++ b/src/mlpack/core/data/dataset_mapper_impl.hpp
@@ -20,15 +20,16 @@ namespace mlpack {
namespace data {
// Default constructor.
-template<typename PolicyType>
-inline DatasetMapper<PolicyType>::DatasetMapper(const size_t dimensionality) :
+template<typename PolicyType, typename InputType>
+inline DatasetMapper<PolicyType, InputType>::DatasetMapper(
+ const size_t dimensionality) :
types(dimensionality, Datatype::numeric)
{
// Nothing to initialize here.
}
-template<typename PolicyType>
-inline DatasetMapper<PolicyType>::DatasetMapper(PolicyType& policy,
+template<typename PolicyType, typename InputType>
+inline DatasetMapper<PolicyType, InputType>::DatasetMapper(PolicyType& policy,
const size_t dimensionality) :
types(dimensionality, Datatype::numeric),
policy(std::move(policy))
@@ -37,22 +38,22 @@ inline DatasetMapper<PolicyType>::DatasetMapper(PolicyType& policy,
}
// Utility helper function to call MapFirstPass.
-template<typename PolicyType, typename T>
+template<typename PolicyType, typename InputType, typename T>
void CallMapFirstPass(
PolicyType& policy,
- const std::string& string,
+ const InputType& input,
const size_t dimension,
std::vector<Datatype>& types,
const typename std::enable_if<PolicyType::NeedsFirstPass>::type* = 0)
{
- policy.template MapFirstPass<T>(string, dimension, types);
+ policy.template MapFirstPass<T>(input, dimension, types);
}
// Utility helper function that doesn't call anything.
-template<typename PolicyType, typename T>
+template<typename PolicyType, typename InputType, typename T>
void CallMapFirstPass(
PolicyType& /* policy */,
- const std::string& /* string */,
+ const InputType& /* input */,
const size_t /* dimension */,
std::vector<Datatype>& /* types */,
const typename std::enable_if<!PolicyType::NeedsFirstPass>::type* = 0)
@@ -60,28 +61,29 @@ void CallMapFirstPass(
// Nothing to do here.
}
-template<typename PolicyType>
+template<typename PolicyType, typename InputType>
template<typename T>
-void DatasetMapper<PolicyType>::MapFirstPass(const std::string& string,
- const size_t dimension)
+void DatasetMapper<PolicyType, InputType>::MapFirstPass(const InputType& input,
+ const size_t dimension)
{
// Call the correct overload (via SFINAE).
- CallMapFirstPass<PolicyType, T>(policy, string, dimension, types);
+ CallMapFirstPass<PolicyType, InputType, T>(policy, input, dimension, types);
}
// When we want to insert value into the map, we use the policy to map the
-// string.
-template<typename PolicyType>
+// input.
+template<typename PolicyType, typename InputType>
template<typename T>
-inline T DatasetMapper<PolicyType>::MapString(const std::string& string,
- const size_t dimension)
+inline T DatasetMapper<PolicyType, InputType>::MapString(
+ const InputType& input,
+ const size_t dimension)
{
- return policy.template MapString<MapType, T>(string, dimension, maps, types);
+ return policy.template MapString<MapType, T>(input, dimension, maps, types);
}
-// Return the string corresponding to a value in a given dimension.
-template<typename PolicyType>
-inline const std::string& DatasetMapper<PolicyType>::UnmapString(
+// Return the input corresponding to a value in a given dimension.
+template<typename PolicyType, typename InputType>
+inline const InputType& DatasetMapper<PolicyType, InputType>::UnmapString(
const size_t value,
const size_t dimension)
{
@@ -97,27 +99,29 @@ inline const std::string& DatasetMapper<PolicyType>::UnmapString(
return maps[dimension].first.right.at(value);
}
-// Return the value corresponding to a string in a given dimension.
-template<typename PolicyType>
-inline typename PolicyType::MappedType DatasetMapper<PolicyType>::UnmapValue(
- const std::string& string,
+// Return the value corresponding to an input in a given dimension.
+template<typename PolicyType, typename InputType>
+inline typename PolicyType::MappedType
+DatasetMapper<PolicyType, InputType>::UnmapValue(
+ const InputType& input,
const size_t dimension)
{
// Throw an exception if the value doesn't exist.
- if (maps[dimension].first.left.count(string) == 0)
+ if (maps[dimension].first.left.count(input) == 0)
{
std::ostringstream oss;
- oss << "DatasetMapper<PolicyType>::UnmapValue(): string '" << string
+ oss << "DatasetMapper<PolicyType>::UnmapValue(): input '" << input
<< "' unknown for dimension " << dimension;
throw std::invalid_argument(oss.str());
}
- return maps[dimension].first.left.at(string);
+ return maps[dimension].first.left.at(input);
}
// Get the type of a particular dimension.
-template<typename PolicyType>
-inline Datatype DatasetMapper<PolicyType>::Type(const size_t dimension) const
+template<typename PolicyType, typename InputType>
+inline Datatype DatasetMapper<PolicyType, InputType>::Type(
+ const size_t dimension) const
{
if (dimension >= types.size())
{
@@ -130,8 +134,9 @@ inline Datatype DatasetMapper<PolicyType>::Type(const size_t dimension) const
return types[dimension];
}
-template<typename PolicyType>
-inline Datatype& DatasetMapper<PolicyType>::Type(const size_t dimension)
+template<typename PolicyType, typename InputType>
+inline Datatype& DatasetMapper<PolicyType, InputType>::Type(
+ const size_t dimension)
{
if (dimension >= types.size())
types.resize(dimension + 1, Datatype::numeric);
@@ -139,39 +144,37 @@ inline Datatype& DatasetMapper<PolicyType>::Type(const size_t dimension)
return types[dimension];
}
-template<typename PolicyType>
-inline
-size_t DatasetMapper<PolicyType>::NumMappings(const size_t dimension) const
+template<typename PolicyType, typename InputType>
+inline size_t
+DatasetMapper<PolicyType, InputType>::NumMappings(const size_t dimension) const
{
return (maps.count(dimension) == 0) ? 0 : maps.at(dimension).second;
}
-template<typename PolicyType>
-inline size_t DatasetMapper<PolicyType>::Dimensionality() const
+template<typename PolicyType, typename InputType>
+inline size_t DatasetMapper<PolicyType, InputType>::Dimensionality() const
{
return types.size();
}
-template<typename PolicyType>
-inline const PolicyType& DatasetMapper<PolicyType>::Policy() const
+template<typename PolicyType, typename InputType>
+inline const PolicyType& DatasetMapper<PolicyType, InputType>::Policy() const
{
return this->policy;
}
-template<typename PolicyType>
-inline PolicyType& DatasetMapper<PolicyType>::Policy()
+template<typename PolicyType, typename InputType>
+inline PolicyType& DatasetMapper<PolicyType, InputType>::Policy()
{
return this->policy;
}
-template<typename PolicyType>
-inline void DatasetMapper<PolicyType>::Policy(PolicyType&& policy)
+template<typename PolicyType, typename InputType>
+inline void DatasetMapper<PolicyType, InputType>::Policy(PolicyType&& policy)
{
this->policy = std::forward<PolicyType>(policy);
}
-
-
} // namespace data
} // namespace mlpack
diff --git a/src/mlpack/core/data/map_policies/increment_policy.hpp b/src/mlpack/core/data/map_policies/increment_policy.hpp
index 4b7f50b26..ec4bda136 100644
--- a/src/mlpack/core/data/map_policies/increment_policy.hpp
+++ b/src/mlpack/core/data/map_policies/increment_policy.hpp
@@ -19,16 +19,24 @@
namespace mlpack {
namespace data {
+
/**
* IncrementPolicy is used as a helper class for DatasetMapper. It tells how the
* strings should be mapped. Purpose of this policy is to map all dimension if
* one if the variables in a dimension turns out to be a categorical variable.
* IncrementPolicy maps strings to incrementing unsigned integers (size_t).
- * The first string to be mapped will be mapped to 0, the next to 1 and so on.
+ * The first input to be mapped will be mapped to 0, the next to 1 and so on.
+ *
+ * If the 'forceAllMappings' parameter is set to true, every input is mapped.
+ * Otherwise, an input in a dimension still marked numeric is only mapped if it
+ * cannot be converted to the output type via a stringstream.
*/
class IncrementPolicy
{
public:
+ IncrementPolicy(const bool forceAllMappings = false) :
+ forceAllMappings(forceAllMappings) { }
+
// typedef of MappedType
using MappedType = size_t;
@@ -38,8 +46,8 @@ class IncrementPolicy
/**
* Determine if the dimension is numeric or categorical.
*/
- template<typename T>
- void MapFirstPass(const std::string& string,
+ template<typename T, typename InputType>
+ void MapFirstPass(const InputType& input,
const size_t dim,
std::vector<Datatype>& types)
{
@@ -49,19 +57,21 @@ class IncrementPolicy
return;
}
- // Otherwise we need to attempt to read the value. If the read fails, the
- // dimension is categorical; otherwise we leave it at the default of
- // numeric.
- std::stringstream token;
- token.str(string);
- T val;
- token >> val;
-
- if (token.fail() || !token.eof())
+ if (forceAllMappings)
{
- // Parsing failed; the dimension is categorical.
types[dim] = Datatype::categorical;
}
+ else
+ {
+ // Attempt to convert the input to an output type via a stringstream.
+ std::stringstream token;
+ token << input;
+ T val;
+ token >> val;
+
+ if (token.fail() || !token.eof())
+ types[dim] = Datatype::categorical;
+ }
}
/**
@@ -77,37 +87,34 @@ class IncrementPolicy
* @param maps Unordered map given by the DatasetMapper.
* @param types Vector containing the type information about each dimensions.
*/
- template<typename MapType, typename T>
- T MapString(const std::string& string,
+ template<typename MapType, typename T, typename InputType>
+ T MapString(const InputType& input,
const size_t dimension,
MapType& maps,
std::vector<Datatype>& types)
{
// If we are in a categorical dimension we already know we need to map.
- if (types[dimension] == Datatype::numeric)
+ if (types[dimension] == Datatype::numeric && !forceAllMappings)
{
// Check if this string needs to be mapped or if it can be read
// directly as a number. This will be true if nothing else in this
// dimension has yet been mapped, but this can't be read as a number.
std::stringstream token;
- token.str(string);
+ token << input;
T val;
token >> val;
if (!token.fail() && token.eof())
- {
- // We can return what we have.
return val;
- }
- }
- // The token must be mapped.
+ // Otherwise, we must map.
+ }
// If this condition is true, either we have no mapping for the given string
// or we have no mappings for the given dimension at all. In either case,
// we create a mapping.
if (maps.count(dimension) == 0 ||
- maps[dimension].first.left.count(string) == 0)
+ maps[dimension].first.left.count(input) == 0)
{
// This string does not exist yet.
size_t& numMappings = maps[dimension].second;
@@ -116,16 +123,20 @@ class IncrementPolicy
if (numMappings == 0)
types[dimension] = Datatype::categorical;
- typedef boost::bimap<std::string, MappedType>::value_type PairType;
- maps[dimension].first.insert(PairType(string, numMappings));
+ typedef typename boost::bimap<InputType, MappedType>::value_type PairType;
+ maps[dimension].first.insert(PairType(input, numMappings));
return T(numMappings++);
}
else
{
// This string already exists in the mapping.
- return maps[dimension].first.left.at(string);
+ return maps[dimension].first.left.at(input);
}
}
+
+ private:
+ // If true, every input is mapped, even if it could be read as a number.
+ bool forceAllMappings;
}; // class IncrementPolicy
} // namespace data
diff --git a/src/mlpack/core/util/param.hpp b/src/mlpack/core/util/param.hpp
index 0d08cb69f..c673a1daf 100644
--- a/src/mlpack/core/util/param.hpp
+++ b/src/mlpack/core/util/param.hpp
@@ -21,10 +21,10 @@ namespace data {
class IncrementPolicy;
-template<typename PolicyType>
+template<typename PolicyType, typename InputType>
class DatasetMapper;
-using DatasetInfo = DatasetMapper<IncrementPolicy>;
+using DatasetInfo = DatasetMapper<IncrementPolicy, std::string>;
} // namespace data
} // namespace mlpack
diff --git a/src/mlpack/core/util/param_data.hpp b/src/mlpack/core/util/param_data.hpp
index 511f59e66..ef7b08b6e 100644
--- a/src/mlpack/core/util/param_data.hpp
+++ b/src/mlpack/core/util/param_data.hpp
@@ -91,8 +91,8 @@ struct ParameterType<arma::Row<eT>>
* For matrix+dataset info types, we should accept a std::string.
*/
template<typename eT, typename PolicyType>
-struct ParameterType<std::tuple<mlpack::data::DatasetMapper<PolicyType>,
- arma::Mat<eT>>>
+struct ParameterType<std::tuple<mlpack::data::DatasetMapper<PolicyType,
+ std::string>, arma::Mat<eT>>>
{
typedef std::string type;
};
diff --git a/src/mlpack/tests/imputation_test.cpp b/src/mlpack/tests/imputation_test.cpp
index 9fcd26a2d..f1ffd82cb 100644
--- a/src/mlpack/tests/imputation_test.cpp
+++ b/src/mlpack/tests/imputation_test.cpp
@@ -16,6 +16,7 @@
#include <mlpack/core/data/dataset_mapper.hpp>
#include <mlpack/core/data/map_policies/increment_policy.hpp>
#include <mlpack/core/data/map_policies/missing_policy.hpp>
+#include <mlpack/core/data/map_policies/map_all.hpp>
#include <mlpack/core/data/imputer.hpp>
#include <mlpack/core/data/imputation_methods/custom_imputation.hpp>
#include <mlpack/core/data/imputation_methods/listwise_deletion.hpp>
@@ -266,4 +267,52 @@ BOOST_AUTO_TEST_CASE(ListwiseDeletionTest)
BOOST_REQUIRE_CLOSE(rowWiseInput(1, 3), 8.0, 1e-5);
}
+/**
+ * Make sure we can map non-strings.
+ */
+BOOST_AUTO_TEST_CASE(DatasetMapperNonStringMapping)
+{
+ IncrementPolicy incr(true);
+ DatasetMapper<IncrementPolicy, double> dm(incr, 1);
+ dm.MapString<size_t>(5.0, 0);
+ dm.MapString<size_t>(4.3, 0);
+ dm.MapString<size_t>(1.1, 0);
+
+ BOOST_REQUIRE_EQUAL(dm.NumMappings(0), 3);
+
+ BOOST_REQUIRE_EQUAL(dm.Type(0), data::Datatype::categorical);
+
+ BOOST_REQUIRE_EQUAL(dm.UnmapValue(5.0, 0), 0);
+ BOOST_REQUIRE_EQUAL(dm.UnmapValue(4.3, 0), 1);
+ BOOST_REQUIRE_EQUAL(dm.UnmapValue(1.1, 0), 2);
+
+ BOOST_REQUIRE_EQUAL(dm.UnmapString(0, 0), 5.0);
+ BOOST_REQUIRE_EQUAL(dm.UnmapString(1, 0), 4.3);
+ BOOST_REQUIRE_EQUAL(dm.UnmapString(2, 0), 1.1);
+}
+
+/**
+ * Make sure we can map unusual input types (here, raw pointers).
+ */
+BOOST_AUTO_TEST_CASE(DatasetMapperPointerMapping)
+{
+ int a = 1, b = 2, c = 3;
+ IncrementPolicy incr(true);
+ DatasetMapper<IncrementPolicy, int*> dm(incr, 1);
+
+ dm.MapString<size_t>(&a, 0);
+ dm.MapString<size_t>(&b, 0);
+ dm.MapString<size_t>(&c, 0);
+
+ BOOST_REQUIRE_EQUAL(dm.NumMappings(0), 3);
+
+ BOOST_REQUIRE_EQUAL(dm.UnmapValue(&a, 0), 0);
+ BOOST_REQUIRE_EQUAL(dm.UnmapValue(&b, 0), 1);
+ BOOST_REQUIRE_EQUAL(dm.UnmapValue(&c, 0), 2);
+
+ BOOST_REQUIRE_EQUAL(dm.UnmapString(0, 0), &a);
+ BOOST_REQUIRE_EQUAL(dm.UnmapString(1, 0), &b);
+ BOOST_REQUIRE_EQUAL(dm.UnmapString(2, 0), &c);
+}
+
BOOST_AUTO_TEST_SUITE_END();
--
2.11.0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment