// AmpTest.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <boost/preprocessor/repeat.hpp>
#include <boost/preprocessor/enum.hpp>
#include <boost/preprocessor/enum_params.hpp>
#include <boost/range/counting_range.hpp>
#include <boost/range/adaptor/reversed.hpp>
using namespace concurrency;
template<typename T, int N> struct fixed_array;
#define ENUM_ARRAY_INDEX_M(z, n, data) data[n]
#define ENUM_ARRAY_INDEX(count, data) BOOST_PP_ENUM(count, ENUM_ARRAY_INDEX_M, data)
#define ENUM_FIELD_INIT_M(z, n, data) m_ ## data ## n (data ## n)
#define ENUM_FIELD_INIT(count, data) BOOST_PP_ENUM(count, ENUM_FIELD_INIT_M, data)
#define ENUM_FIELD_DECL_M(z, n, data) data ## n;
#define ENUM_FIELD_DECL(count, data) BOOST_PP_REPEAT(count, ENUM_FIELD_DECL_M, data)
#define ENUM_CASE_M(z, n, data) case n: return data ## n;
#define ENUM_CASE(count, data) BOOST_PP_REPEAT(count, ENUM_CASE_M, data)
#define DEF_SPEC_ARRAY(z, n, data) \
template<typename T> struct fixed_array<T, n> { \
public: \
ENUM_FIELD_DECL(n, T m_elem); \
\
inline fixed_array(BOOST_PP_ENUM_PARAMS(n, T const& elem)) restrict(amp, cpu) : ENUM_FIELD_INIT(n, elem) {}; \
inline T const& operator[](int i) const restrict(amp, cpu) { \
switch (i) { \
ENUM_CASE(n, m_elem) \
default: return m_elem0; \
} \
} \
};
BOOST_PP_REPEAT(16, DEF_SPEC_ARRAY, ());
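// For reference, the specialization generated for n == 2 looks roughly like this (a sketch of the
// macro expansion, not emitted literally):
//
// template<typename T> struct fixed_array<T, 2> {
// public:
//     T m_elem0; T m_elem1;
//     inline fixed_array(T const& elem0, T const& elem1) restrict(amp, cpu) : m_elem0(elem0), m_elem1(elem1) {};
//     inline T const& operator[](int i) const restrict(amp, cpu) {
//         switch (i) {
//             case 0: return m_elem0; case 1: return m_elem1;
//             default: return m_elem0;
//         }
//     }
// };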
template<unsigned N> class range_to;
const int uniform_param = 0x80000000;
template<unsigned N, int I> class param_indexer {
public:
template<typename T> static auto& index(T& param) restrict(amp, cpu) {
return param[N + I];
}
};
template<unsigned N> class param_indexer<N, uniform_param> {
public:
template<typename T> static T& index(T& param) restrict(amp, cpu) {
return param;
}
};
template<typename F>
class range_invoker {
public:
F f;
template<int... I, unsigned N, typename... P> inline void each(range_to<N> unused, P&... p) restrict(amp, cpu) {
each<I...>(range_to<N - 1>(), p...);
f(N - 1, param_indexer<N - 1, I>::index(p)...);
}
template<int... I, unsigned N, typename... P> inline void eachRev(range_to<N> unused, P&... p) restrict(amp, cpu) {
f(N - 1, param_indexer<N - 1, I>::index(p)...);
eachRev<I...>(range_to<N - 1>(), p...);
}
template<int... I, typename... P> inline void each(range_to<0> unused, P&... p) restrict(amp, cpu) { }
template<int... I, typename... P> inline void eachRev(range_to<0> unused, P&... p) restrict(amp, cpu) { }
};
template<unsigned N> class range_to {
public:
template<int... I, typename F, typename... P> static void each(F f, P&... p) restrict(amp, cpu) {
range_invoker<F> invoker{ f };
invoker.each<I...>(range_to<N>(), p...);
}
template<int... I, typename F, typename... P> static void eachRev(F f, P&... p) restrict(amp, cpu) {
range_invoker<F> invoker{ f };
invoker.eachRev<I...>(range_to<N>(), p...);
}
};
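// Usage sketch (semantics derived from the definitions above): each entry of the index pack is a
// per-parameter offset, with `uniform_param` meaning "pass the argument through unchanged".
// For example,
//     range_to<3>::each<0, uniform_param>(f, xs, s);
// issues f(0, xs[0], s); f(1, xs[1], s); f(2, xs[2], s);
// while eachRev issues the same calls in reverse order (i = 2, 1, 0).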
template<typename T, int Rank>
void fill(array<T, Rank>& arr, T initValue) {
parallel_for_each(arr.extent, [&arr, initValue](index<Rank> idx) restrict(amp) {
arr[idx] = initValue;
});
}
void printArray(array_view<float, 1> view) {
view.refresh();
array_view<float, 1> temp(view.extent);
view.copy_to(temp);
for (int i = 0; i < temp.extent[0]; ++i)
std::cout << std::setw(8) << temp[i];
std::cout << std::endl;
}
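// A table pairs a value buffer with a gradient buffer of the same extent; table_view is the
// matching pair of array_views, which is what actually gets captured by the AMP kernels below.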
template<int N = 1>
struct table {
array<float, N> m_value;
array<float, N> m_gradient;
inline table(extent<N> size) : m_value(size), m_gradient(size) { }
inline extent<N> extent() const { return m_value.extent; }
};
template<int N = 1>
struct table_view {
array_view<float, N> m_value;
array_view<float, N> m_gradient;
inline table_view() : m_value(extent<N>()), m_gradient(extent<N>()) { }
inline table_view(table<N>& src) : m_value(src.m_value), m_gradient(src.m_gradient) { }
inline extent<N> extent() const { return m_value.extent; }
template<int M>
table_view<M> view_as(concurrency::extent<M> extent) {
return table_view<M>{ m_value.view_as(extent), m_gradient.view_as(extent) };
}
table_view<N> section(concurrency::index<N> index, concurrency::extent<N> extent) {
return table_view<N>{ m_value.section(index, extent), m_gradient.section(index, extent) };
}
};
class network;
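// module is the base interface: updateOutput() computes forward values and updateGradInput()
// propagates gradients backwards; network runs the module sequence forwards and then in reverse.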
class module {
protected:
network* m_network;
inline module(network* nn) : m_network(nn) { }
public:
// Virtual destructor so modules can be destroyed through std::unique_ptr<module>
inline virtual ~module() { }
virtual void updateOutput() = 0;
virtual void updateGradInput() = 0;
};
template<int N, typename S>
class module_function : public module {
protected:
table<> m_output;
fixed_array<table_view<>, N> m_inputs;
public:
template<typename... T>
inline module_function(network* nn, T... inputs) : module(nn), m_output(S::extent({ inputs... })), m_inputs(inputs...) { }
virtual void updateOutput() {
auto inputs = m_inputs;
table_view<> output = m_output;
try {
// Update outputs
output.m_value.discard_data();
parallel_for_each(
output.extent(),
[=](index<1> idx) restrict(amp) {
S::forward(idx, output, inputs);
}
);
if (m_network->getIsLearning()) {
// Clear gradients
for (int i = 0; i < N; ++i)
inputs[i].m_gradient.discard_data();
parallel_for_each(
output.extent(),
[=](index<1> idx) restrict(amp) {
range_to<N>::each<0>([=](int i, auto& inputi) restrict(amp) {
inputi.m_gradient[idx] = 0.0f;
}, inputs);
}
);
}
} catch (concurrency::runtime_exception& ex) {
OutputDebugStringA(ex.what());
DebugBreak();
}
}
virtual void updateGradInput() {
auto inputs = m_inputs;
table_view<> output = m_output;
try {
parallel_for_each(
output.extent(),
[=](index<1> idx) restrict(amp) {
S::backward(idx, output, inputs);
}
);
} catch (concurrency::runtime_exception& ex) {
OutputDebugStringA(ex.what());
DebugBreak();
}
}
inline table_view<> getOutput() {
return table_view<>(m_output);
}
};
// Base class for modules of the form `foldl <op> [inputs...]`,
// which includes scalar arithmetic
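// In other words, for a module like module_mul<3> the forward pass computes
// output = op(op(op(identity(), x0), x1), x2) = ((1 * x0) * x1) * x2,
// and the backward pass distributes the output gradient to each xi via the chain rule.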
template<int N, typename S>
class module_scalar : public module_function<N, S> {
public:
using module_function::module_function;
static inline extent<1> extent(std::initializer_list<table_view<>> inputs) {
auto extent = inputs.begin()->extent();
for (auto& input : inputs) {
if (input.extent() != extent)
throw "Extent mismatch";
}
return extent;
}
template<bool unused = false>
static inline void forward(index<1> idx, table_view<> const& output, fixed_array<table_view<>, N> const& inputs) restrict(amp) {
// Default implementation of `forward` is essentially `foldl <op> [inputs...]`
float acc = S::identity();
range_to<N>::each<uniform_param, 0>([=](int i, float& acc, auto& inputi) restrict(amp) {
acc = S::op(acc, inputi.m_value[idx]);
}, acc, inputs);
output.m_value[idx] = acc;
}
template<bool unused = false>
static inline void backward(index<1> idx, table_view<> const& output, fixed_array<table_view<>, N> const& inputs) restrict(amp) {
// This is a fun little snippet of code which calculates all N partial
// derivatives of `foldl <op> [inputs...]` in O(N) time
// We can't use normal loops because AMP is funny about them...
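// Notation: x[i] caches the i-th input, acc[i] is the fold of the first i inputs,
// dacc0[i] is d(output)/d(acc[i + 1]), and y[i] = dop1(acc[i], x[i]) * dacc0[i] is
// d(output)/d(x[i]); each input gradient is then accumulated as gradient * y[i].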
float x[N];
range_to<N>::each<0, 0>([=](int i, float& xi, auto& inputi) restrict(amp) {
xi = inputi.m_value[idx];
}, x, inputs);
float acc[N];
acc[0] = S::identity();
range_to<N - 1>::each<1, 0, 0>([=](int i, float& acci1, float& acci, auto& xi) restrict(amp) {
acci1 = S::op(acci, xi);
}, acc, acc, x);
float dacc0[N];
dacc0[N - 1] = 1.0f;
// dacc0[i] = d(output)/d(acc[i + 1]), accumulated right-to-left via the chain rule
range_to<N - 1>::eachRev<0, 1, 1, 1>([=](int i, float& dacc0i, float& acci, auto& xi, float& dacc0next) restrict(amp) {
dacc0i = S::dop0(acci, xi) * dacc0next;
}, dacc0, acc, x, dacc0);
float y[N];
// Cover all N inputs here (range_to<N>, not N - 1) so that y[N - 1] is initialised as well
range_to<N>::eachRev<0, 0, 0, 0>([=](int i, float& yi, float& acci, auto& xi, float& dacc0i) restrict(amp) {
float dacc1 = S::dop1(acci, xi);
yi = dacc1*dacc0i;
}, y, acc, x, dacc0);
float gradient = output.m_gradient[idx];
range_to<N>::each<0, 0>([=](int i, auto& inputi, float& yi) restrict(amp) {
inputi.m_gradient[idx] += gradient*yi;
}, inputs, y);
}
};
template<typename S>
class module_unary : public module_scalar<1, S> {
public:
using module_scalar::module_scalar;
static inline void forward(index<1> idx, table_view<> const& output, fixed_array<table_view<>, 1> const& inputs) restrict(amp) {
output.m_value[idx] = S::op(inputs[0].m_value[idx]);
}
static inline void backward(index<1> idx, table_view<> const& output, fixed_array<table_view<>, 1> const& inputs) restrict(amp) {
inputs[0].m_gradient[idx] += output.m_gradient[idx] * S::dop(output.m_value[idx], inputs[0].m_value[idx]);
}
};
template<int N = 2>
class module_add : public module_scalar<N, module_add<N>> {
public:
using module_scalar::module_scalar;
static inline float identity() restrict(amp) {
return 0.0f;
}
static inline float op(float a, float b) restrict(amp) {
return a+b;
}
static inline float dop0(float a, float b) restrict(amp) {
return 1.0f;
}
static inline float dop1(float a, float b) restrict(amp) {
return 1.0f;
}
};
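// module_sub and module_div are also expressed as folds: with identity 0, folding
// op(a, b) = -(a + b) over (x0, x1) gives -(-(0 + x0) + x1) = x0 - x1, and with identity 1,
// folding op(a, b) = 1 / (a * b) over (x0, x1) gives x0 / x1.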
class module_sub : public module_scalar<2, module_sub> {
public:
using module_scalar::module_scalar;
static inline float identity() restrict(amp) {
return 0.0f;
}
static inline float op(float a, float b) restrict(amp) {
return -(a + b);
}
static inline float dop0(float a, float b) restrict(amp) {
return -1.0f;
}
static inline float dop1(float a, float b) restrict(amp) {
return -1.0f;
}
};
class module_neg : public module_unary<module_neg> {
public:
using module_unary::module_unary;
static inline float op(float input) restrict(amp) {
return -input;
}
static inline float dop(float output, float input) restrict(amp) {
return -1.0f;
}
};
template<int N = 2>
class module_mul : public module_scalar<N, module_mul<N>> {
public:
using module_scalar::module_scalar;
static inline float identity() restrict(amp) {
return 1.0f;
}
static inline float op(float a, float b) restrict(amp) {
return a*b;
}
static inline float dop0(float a, float b) restrict(amp) {
return b;
}
static inline float dop1(float a, float b) restrict(amp) {
return a;
}
};
class module_div : public module_scalar<2, module_div> {
public:
using module_scalar::module_scalar;
static inline float identity() restrict(amp) {
return 1.0f;
}
static inline float op(float a, float b) restrict(amp) {
return 1.0f / (a*b);
}
static inline float dop0(float a, float b) restrict(amp) {
return -1.0f / (a*a*b);
}
static inline float dop1(float a, float b) restrict(amp) {
return -1.0f / (a*b*b);
}
};
class module_rcp : public module_unary<module_rcp> {
public:
using module_unary::module_unary;
static inline float op(float input) restrict(amp) {
return 1.0f / input;
}
static inline float dop(float output, float input) restrict(amp) {
return -1.0f / (input*input);
}
};
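// The unary activations below express their derivative in terms of the cached output:
// sigmoid'(x) = s(x) * (1 - s(x)) and tanh'(x) = 1 - tanh(x)^2.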
class module_sigmoid : public module_unary<module_sigmoid> {
public:
using module_unary::module_unary;
static inline float op(float input) restrict(amp) {
return 1.0f / (1.0f + concurrency::fast_math::exp(-input));
}
static inline float dop(float output, float input) restrict(amp) {
return output*(1.0f - output);
}
};
class module_tanh : public module_unary<module_tanh> {
public:
using module_unary::module_unary;
static inline float op(float input) restrict(amp) {
return concurrency::fast_math::tanh(input);
}
static inline float dop(float output, float input) restrict(amp) {
return 1.0f - output*output;
}
};
class module_input : public module {
protected:
table<> m_output;
public:
inline module_input(network* nn, extent<1> extent) : module(nn), m_output(extent) { }
virtual void updateOutput() {
}
virtual void updateGradInput() {
}
void setValue(array_view<float,1> value) {
if (value.extent != m_output.extent())
throw "Extent mismatch";
value.copy_to(m_output.m_value);
}
inline table_view<> getOutput() {
return table_view<>(m_output);
}
};
template<int N, typename S>
class module_container : public module {
protected:
fixed_array<table_view<>, N> m_outputs;
public:
template<typename... P>
inline module_container(network* nn, P&&... p) : module(nn), m_outputs(S::build(nn, std::forward<P>(p)...)) { }
virtual void updateOutput() {
}
virtual void updateGradInput() {
}
};
class module_lstm : public module_container<1, module_lstm> {
public:
using module_container::module_container;
static fixed_array<table_view<>, 1> build(network* nn) {
// Not implemented yet; return an empty view so the stub is well-formed.
return fixed_array<table_view<>, 1>(table_view<>());
}
};
enum tensor_type {
tensor_type_transient = 0,
tensor_type_state,
tensor_type_weight,
tensor_type_count
};
class tensor_base {
public:
virtual unsigned size() = 0;
virtual void setSource(table_view<1> source) = 0;
};
template<int N = 1> class tensor : public tensor_base {
private:
table_view<N> m_data;
extent<N> m_extent;
public:
tensor(network* nn, tensor_type type, extent<N> extent) : m_data(), m_extent(extent) {
nn->registerTensor(type, this);
}
virtual unsigned size() {
return m_extent.size();
}
virtual void setSource(table_view<1> source) {
m_data = source.view_as(m_extent);
}
inline extent<N> extent() {
return m_extent;
}
inline table_view<N> view() {
return m_data;
}
};
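// Packs every registered tensor into one contiguous table<1>: allocate() sums the sizes, creates
// a single backing table and hands each tensor a section of it, so e.g. all weights of a given
// tensor_type can later be updated with a single pass over one buffer.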
class linear_tensor_allocator {
private:
std::vector<tensor_base*> m_tensors;
std::unique_ptr<table<1>> m_table;
public:
void registerTensor(tensor_base* tensor) {
m_tensors.push_back(tensor);
}
void allocate() {
unsigned totalSize = 0;
for (auto tensor : m_tensors) {
totalSize += tensor->size();
}
m_table = std::make_unique<table<1>>(extent<1>(totalSize));
table_view<1> tableView(*m_table);
unsigned offset = 0;
for (auto tensor : m_tensors) {
unsigned size = tensor->size();
tensor->setSource(tableView.section(index<1>(offset), extent<1>(size)));
offset += size;
}
}
};
class network {
private:
std::vector<std::unique_ptr<module>> m_moduleSeq;
bool m_isLearning;
linear_tensor_allocator m_tensorAllocators[tensor_type_count];
template<typename T>
inline T* addModule(std::unique_ptr<T>&& m) {
T* result = m.get();
m_moduleSeq.push_back(std::move(m));
return result;
}
public:
inline network() : m_isLearning(true) {
}
inline bool getIsLearning() {
return m_isLearning;
}
inline void setIsLearning(bool isLearning) {
m_isLearning = isLearning;
}
template<typename T, typename... P> inline T* make(P&&... args) {
return addModule(std::make_unique<T>(this, std::forward<P>(args)...));
}
void updateOutput() {
for (auto& module : m_moduleSeq)
module->updateOutput();
}
void updateGradInput() {
for (auto& module : boost::adaptors::reverse(m_moduleSeq))
module->updateGradInput();
}
void compile() {
for (int i = 0; i < tensor_type_count; ++i)
m_tensorAllocators[i].allocate();
}
inline void registerTensor(tensor_type type, tensor_base* tensor) {
m_tensorAllocators[type].registerTensor(tensor);
}
};
struct rms_config {
float learningRate = 1e-2f;
float alpha = 0.99f;
float epsilon = 1e-8f;
};
float lerp(float alpha, float a, float b) restrict(amp) {
return alpha*b + (1.0f - alpha)*a;
}
template<typename F>
class rms_prop {
private:
rms_config m_config;
F m_f;
array_view<float, 1>& m_x;
array<float, 1> m_state;
array<float, 1> m_loss;
public:
rms_prop(F f, array_view<float, 1>& x, rms_config const& config) : m_config(config), m_f(f), m_x(x), m_state(x.get_extent()), m_loss(x.get_extent()) {
fill(m_state, 0.0f);
fill(m_loss, 0.0f);
}
void step() {
// NOTE: still a placeholder update - the squared-gradient statistics from m_f are not wired in yet.
array<float, 1>& state = m_state;
rms_config config = m_config;
parallel_for_each(m_x.extent, [&state, config](index<1> idx) restrict(amp) {
state[idx] = lerp(config.alpha, state[idx], state[idx]);
});
}
};
int _tmain(int argc, _TCHAR* argv[])
{
network nn;
extent<1> size(8);
auto a = nn.make<module_input>(size);
auto b = nn.make<module_add<2>>(a->getOutput(), a->getOutput());
auto c = nn.make<module_div>(b->getOutput(), a->getOutput());
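// a is filled with 1..8 below; b = a + a, and c folds module_div over (b, a), so after
// updateOutput() the printed rows should be a, then 2 * a, then a constant 2.0 for c.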
array<float, 1> data(size, boost::make_counting_iterator(1.0f));
a->setValue(data);
nn.updateOutput();
nn.updateGradInput();
printArray(a->getOutput().m_value);
printArray(b->getOutput().m_value);
printArray(c->getOutput().m_value);
getchar();
return 0;
}