Heiko is worried that we will lose it :)
// Immutable features
// Linear model
- dot product
- pairs
- covariance matrix: there's a Feature operator => CovarView => matrix
class Features {
    Features() {}
    ...
    Features transformed_by(Transformer t) const;
    // this evaluates the stacked operators over the features
    // and returns a copy of the features
    Features cached() const;
protected:
    void add_flag(int flag) {
        flags |= flag;
    }
    int get_flags();
}
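A minimal sketch of how this immutable transform/cache chain could behave, assuming a Transformer is simply a callable over the underlying matrix; all names below are illustrative, not existing API:

#include <functional>
#include <memory>
#include <utility>
#include <vector>

using Matrix = std::vector<std::vector<double>>;
using Transformer = std::function<Matrix(const Matrix&)>;

class Features {
public:
    explicit Features(Matrix data) : data_(std::make_shared<Matrix>(std::move(data))) {}

    // Immutable: returns a copy that remembers one more pending transformer.
    Features transformed_by(Transformer t) const {
        Features copy = *this;
        copy.pending_.push_back(std::move(t));
        return copy;
    }

    // Evaluates the stacked transformers once and returns features backed by
    // the materialised matrix, with an empty pending stack.
    Features cached() const {
        Matrix result = *data_;
        for (const auto& t : pending_)
            result = t(result);
        return Features(std::move(result));
    }

private:
    std::shared_ptr<Matrix> data_;      // shared, never mutated in place
    std::vector<Transformer> pending_;  // transformers not yet evaluated
};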
class DotFeatures : Features
{
    double inner(iterator, iterator) = 0;
    double inner(iterator, Vector) = 0;
    double inner(batch, Vector w) { for each in batch: collect(inner(each, w)) }
    double inner(batch l, batch r) { for each_l in l: for each_r in r: collect(inner(each_l, each_r)) }
}
class DenseFeatures : DotFeatures
{
    // this is more efficient due to storage
    double inner(batch, Vector w) { linalg::do(matrix(batch) * w) }
    double inner(batch l, batch r) { linalg::do(matrix(l) * matrix(r)) }
    Matrix matrix(batch) {
        // return a submatrix if possible (batch is contiguous and no preprocessors left),
        // otherwise create a new matrix
    }
    iterator begin() {
        // magic choosing between ranges and indices
    }
    vector<Range> ranges;
    vector<Index> indices;
    shared<Matrix> data;
}
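One way the submatrix-or-copy decision inside matrix(batch) could look, assuming column-major storage; ColumnBlock and the preprocessors_pending flag are made up for this sketch and are not part of the design above:

#include <cstddef>
#include <optional>
#include <span>
#include <vector>

// Column-major storage: column j occupies data[j * rows, (j + 1) * rows).
struct ColumnBlock {
    std::span<const double> view;              // always set
    std::optional<std::vector<double>> owned;  // set only when a copy was needed
};

class DenseFeatures {
public:
    DenseFeatures(std::vector<double> data, std::size_t rows)
        : data_(std::move(data)), rows_(rows) {}

    // Contiguous batch and no pending preprocessors: hand out a view into the
    // shared storage; otherwise gather the requested columns into a fresh block.
    ColumnBlock matrix(const std::vector<std::size_t>& batch, bool preprocessors_pending) const {
        if (!preprocessors_pending && is_contiguous(batch))
            return {std::span<const double>(data_.data() + batch.front() * rows_,
                                            batch.size() * rows_),
                    std::nullopt};
        std::vector<double> copy;
        copy.reserve(batch.size() * rows_);
        for (std::size_t j : batch)
            copy.insert(copy.end(), data_.begin() + j * rows_, data_.begin() + (j + 1) * rows_);
        ColumnBlock block{std::span<const double>{}, std::move(copy)};
        block.view = std::span<const double>(*block.owned);
        return block;
    }

private:
    static bool is_contiguous(const std::vector<std::size_t>& cols) {
        for (std::size_t i = 1; i < cols.size(); ++i)
            if (cols[i] != cols[i - 1] + 1) return false;
        return !cols.empty();
    }

    std::vector<double> data_;
    std::size_t rows_;
};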
class StreamingDotFeatures {
}
df = DataFrame()
f = Features(df).transformed_by(Mean).transformed_by(Normalize).cached();
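For illustration only, Mean in this chain could be a centering transformer, under the same "Transformer is a callable" assumption as the earlier sketch (not existing API):

#include <cstddef>
#include <functional>
#include <vector>

using Matrix = std::vector<std::vector<double>>;  // row-major: Matrix[i] is one example
using Transformer = std::function<Matrix(const Matrix&)>;

// Subtracts the per-feature mean; features transformed this way are what the
// CENTERED check in the model training code below expects.
const Transformer Mean = [](const Matrix& in) {
    if (in.empty()) return in;
    std::size_t dim = in[0].size();
    std::vector<double> mean(dim, 0.0);
    for (const auto& row : in)
        for (std::size_t j = 0; j < dim; ++j) mean[j] += row[j];
    for (double& m : mean) m /= static_cast<double>(in.size());
    Matrix out = in;
    for (auto& row : out)
        for (std::size_t j = 0; j < dim; ++j) row[j] -= mean[j];
    return out;
};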
class Covariance {
    Covariance(some<Features> f, Options options) { dot_f = as_dot_features(f); }
    Matrix full_matrix() const { covariance = zeros(); ... call dot_f.outer_into(covariance, each, other) or dot_f.batched_dot ... }
    Matrix diagonal() const { ... compute a few of them using the same procedure ... }
    some<DotFeatures> dot_f;
}
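Roughly what the accumulation inside full_matrix() amounts to; in the design above this loop would be delegated to dot_f.outer_into / batched_dot so that sparse or streaming features can supply a faster equivalent. The free function below is a plain illustration, not proposed API:

#include <cstddef>
#include <vector>

using Vector = std::vector<double>;
using Matrix = std::vector<Vector>;  // Matrix[i][j]

// Accumulate x * x^T over all (already centered) feature vectors; whether the
// result is additionally scaled by 1/N is left open here.
Matrix full_matrix(const std::vector<Vector>& examples, std::size_t dim) {
    Matrix cov(dim, Vector(dim, 0.0));
    for (const Vector& x : examples)
        for (std::size_t i = 0; i < dim; ++i)
            for (std::size_t j = 0; j < dim; ++j)
                cov[i][j] += x[i] * x[j];
    return cov;
}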
class DotIterator {
    double inner(DotIterator other) const {
        return dot_f.inner(current, other.current);
    }
    double inner(Vector other) const;
    void add(Vector other, double alpha);
    void auto_outer_into(Matrix outers) const;
    void outer_into(DotIterator other, Matrix outer) const;
    DotFeatures dot_f;
    iterator current;
}
class BatchedDotIterator {
    Batched(DotIterator)
    // same stuff as above, but batched
}
SGDRegressor {
    // NO STATE BUT W
    SGDRegressor()
    train(Features f) {
        // .. check centered ..
        // warm-start?
        auto w = initial_w(f);
        auto iterator = BatchedDotIterator(f);
        auto alpha = get(ALPHA);
        while (not stopped and iterator.has_more()) {
            iterator.add(w, alpha);
            iterator++;
        }
        set(W, w);
    }
}
sgd = SGDRegressor();
f = DenseFeatures(pd.read_csv('concrete.csv'));
sgd.set("warm_start", Vector::RandomLaplace(f.n_features()));
splitting = BinarySplitSubset(f)
train, test = splitting.first(), splitting.second()
sgd.train(train)
sgd.apply(test)
f = StreamingFeatures(HdfsStream("hdfs://my_porno_data"))
sgd.train(HeadSubset(f, 1e6))
LinearModel {
    // NO STATE BUT W
    LinearModel()
    Vector initial_w(Features f);
    train(Features f) {
        assert(f.get_flags() & CENTERED);
        // we do not expose the matrix/vector type at the model level;
        // linalg should figure it out, as we have an opaque Matrix/Vector class
        // which is type agnostic.
        Matrix covariance = Covariance(f).full_matrix();
        linalg::add_diag(covariance, get(LAMBDA));
        set(W, linalg::cholesky_solve(covariance, y));
    }
    Tag<Vector> LAMBDA;
    Tag<Vector> W;
}
batch optimisation is taken care of automagically
model.train(SubsetView(features))
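A hypothetical SubsetView, sketched only to show why this can stay automatic: the view translates local batch positions into indices of the wrapped features, and the features themselves decide per batch whether those indices form a contiguous submatrix view or need a gathered copy. Every name below is illustrative:

#include <cstddef>
#include <vector>

template <typename FeaturesT>
struct SubsetView {
    FeaturesT& parent;
    std::vector<std::size_t> indices;

    std::size_t size() const { return indices.size(); }

    // Local batch [begin, begin + count) -> indices into the parent features;
    // if the subset happens to be one contiguous range, the parent can still
    // hand out a submatrix view instead of copying.
    std::vector<std::size_t> map_batch(std::size_t begin, std::size_t count) const {
        std::vector<std::size_t> mapped(count);
        for (std::size_t k = 0; k < count; ++k)
            mapped[k] = indices[begin + k];
        return mapped;
    }
};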
Kernel use cases:
1) Kernel ridge regression (one needs to compute the whole kernel matrix)
2) Kernel SVM (compute the kernel for a pair, and on batches when applying)
3) Streaming MMD (compute the kernel for a pair drawn from two different streams)
4) Combined kernel (all of the above)
*) Kernel matrix precomputation
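A rough interface that could cover all of these cases; every name and signature here is an assumption sketched for discussion, not existing API:

#include <cstddef>
#include <memory>
#include <utility>
#include <vector>

using Vector = std::vector<double>;
using Matrix = std::vector<Vector>;

class Kernel {
public:
    virtual ~Kernel() = default;

    // (2), (3): a single pair, usable for SVM training and for streaming MMD,
    // where lhs and rhs may come from two different feature objects/streams.
    virtual double operator()(const Vector& lhs, const Vector& rhs) const = 0;

    // (1), (*): whole (or precomputed) kernel matrix, e.g. for kernel ridge regression.
    virtual Matrix matrix(const std::vector<Vector>& lhs,
                          const std::vector<Vector>& rhs) const {
        Matrix k(lhs.size(), Vector(rhs.size()));
        for (std::size_t i = 0; i < lhs.size(); ++i)
            for (std::size_t j = 0; j < rhs.size(); ++j)
                k[i][j] = (*this)(lhs[i], rhs[j]);
        return k;
    }
};

// (4): a combined kernel is just a weighted sum of sub-kernels, reusing the
// pairwise/matrix entry points of its parts.
class CombinedKernel : public Kernel {
public:
    void add(std::shared_ptr<Kernel> k, double weight) {
        parts_.push_back({std::move(k), weight});
    }

    double operator()(const Vector& lhs, const Vector& rhs) const override {
        double sum = 0.0;
        for (const auto& [k, w] : parts_)
            sum += w * (*k)(lhs, rhs);
        return sum;
    }

private:
    std::vector<std::pair<std::shared_ptr<Kernel>, double>> parts_;
};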