Skip to content

Instantly share code, notes, and snippets.

@mantognini
Created April 28, 2016 14:36
Show Gist options
  • Save mantognini/6863b2a42f1871d5728a6b9cd82a16fd to your computer and use it in GitHub Desktop.
Save mantognini/6863b2a42f1871d5728a6b9cd82a16fd to your computer and use it in GitHub Desktop.
//
// main.cpp
// MiniRelation
//
// Created by Marco Antognini on 28/04/16.
// Copyright © 2016 Marco Antognini. All rights reserved.
//
#include <algorithm>
#include <cassert>
#include <chrono>
#include <fstream>
#include <functional>
#include <iostream>
#include <iterator>
#include <map>
#include <sstream>
#include <stdexcept>
#include <unordered_map>
#include <utility>
#include <vector>
#define ROW_STORE 0
using Strings = std::vector<std::string>;
using Clock = std::chrono::high_resolution_clock;
template <class K, class V>
using Map = std::unordered_map<K, V>;
//using Map = std::map<K, V>;
class Schema {
Strings m_columns;
public:
Schema(Strings columns) : m_columns(std::move(columns)) { }
auto const& columns() const { return m_columns; }
auto size() const { return m_columns.size(); }
};
#if ROW_STORE == 1
class Row {
Strings m_values;
friend class Relation;
public:
Row(Strings values) : m_values(std::move(values)) { }
std::string const& getField(Schema const& schema, std::string const& fieldName) const {
assert(schema.size() == m_values.size());
// TODO use std::find?
std::size_t i = 0;
for (auto const& col : schema.columns()) {
if (col == fieldName) return m_values[i];
++i;
}
throw std::runtime_error("No such field <" + fieldName + "> in schema.");
}
};
using Rows = std::vector<Row>;
class Relation {
Schema m_schema;
Rows m_rows;
Relation(Schema schema, Rows rows) : m_schema(std::move(schema)), m_rows(std::move(rows)) { }
public:
/*!
* Scans a relation from a file, given its schema and a field delimiter
*/
static Relation scan(std::string const& filename, Schema schema, char delimiter) {
std::ifstream in(filename);
Rows rows;
assert(in);
std::string line;
while (std::getline(in, line)) {
Strings fields(schema.size());
std::istringstream iss(line);
for (std::size_t i = 0; i < schema.size(); ++i) {
assert(std::getline(iss, fields[i], delimiter));
}
rows.push_back(std::move(fields));
}
return Relation(std::move(schema), std::move(rows));
}
// Immutable operations!
/*!
* Selection operation: filters the rows, removing those that do *not* satisfy predicate `p`
* (i.e. keeping the one that satisfy `p`)
*/
template <class UnaryPredicate>
Relation select(UnaryPredicate p) const {
Rows selected;
std::copy_if(std::begin(m_rows), std::end(m_rows),
std::back_inserter(selected),
p);
return Relation(m_schema, selected);
}
/*!
* Projection operation: reduces the columns of a relation according to a new schema
*
* NOTE the `newSchema` must be an *ordered* subset of `schema`
*/
Relation project(Schema newSchema) const {
assert(newSchema.size() <= m_schema.size());
// TODO might be good to test std::stable_partition with a "mutable" predicate
// a bit like http://stackoverflow.com/a/21148808/520217
// Compute set of columns to remove
std::vector<bool> indexes(m_schema.size()); // false => keep
for (std::size_t i = 0, j = 0; i < m_schema.size(); ++i) {
if (i >= newSchema.size()) {
indexes[i] = true;
} else if (m_schema.columns()[i] == newSchema.columns()[j]) {
indexes[i] = false;
++j;
} else {
indexes[i] = true;
}
}
// Remove some columns
auto projected = m_rows;
for (auto& row : projected) {
auto it = std::begin(row.m_values);
for (std::size_t i = 0; i < indexes.size(); ++i) {
if (indexes[i]) {
it = row.m_values.erase(it);
} else {
++it;
}
}
}
return Relation(std::move(newSchema), std::move(projected));
}
/*!
* Equi-join operation: combines each rows from two tables when their keys match
* the result relation does not have the join-key column from the right relation
*/
Relation join(Relation right, std::string const& leftKey, std::string const& rightKey) {
// Find the index of rightKey for copying the schema and rows later
auto const& rightColumns = right.m_schema.columns();
auto rightKeyIt = std::find(std::begin(rightColumns), std::end(rightColumns), rightKey);
assert(rightKeyIt != std::end(rightColumns));
auto rightKeyIdx = std::distance(std::begin(rightColumns), rightKeyIt);
// Build new schema
auto newSize = m_schema.size() + right.m_schema.size() - 1;
Strings newColumns(newSize);
{
// Don't copy the right key
auto last = std::copy(std::begin(m_schema.columns()), std::end(m_schema.columns()), std::begin(newColumns));
last = std::copy(std::begin(rightColumns), rightKeyIt, last);
std::copy(rightKeyIt + 1, std::end(rightColumns), last);
}
Schema newSchema(newColumns);
Rows newRows;
for (auto const& r1 : m_rows) {
for (auto const& r2 : right.m_rows) {
if (r1.getField(m_schema, leftKey) == r2.getField(right.m_schema, rightKey)) {
Strings fields(newSize);
{
// Don't copy the right key
auto last = std::copy(std::begin(r1.m_values), std::end(r1.m_values), std::begin(fields));
auto rightSkipIt = std::begin(r2.m_values) + rightKeyIdx;
last = std::copy(std::begin(r2.m_values), rightSkipIt, last);
std::copy(rightSkipIt + 1, std::end(r2.m_values), last);
}
newRows.push_back(Row(std::move(fields)));
}
}
}
return Relation(newSchema, newRows);
}
/*!
* Prints the relation to standard output
*/
void print() const {
for (auto const& row : m_rows) {
for (auto const& field : row.m_values) {
std::cout << field << '|';
}
std::cout << '\n';
}
std::cout << std::flush;
}
};
#else
using Column = std::vector<std::string>;
//using Columns = Map<std::string, Column>; // TOO SLOW!
using Columns = std::vector<Column>; // using index for "key"
// Row is a prox only and doesn't own any record!
class Relation;
class Row {
friend Relation;
Relation const& m_relation;
std::size_t const m_index;
Row(Relation const& relation, std::size_t index) : m_relation(relation), m_index(index) { }
public:
std::string const& getField(Schema const&, std::string const& fieldName) const;
};
template <typename I>
class Range {
public:
class iterator {
friend class Range;
public:
I operator*() const { return m_i; }
iterator const& operator++() { ++m_i; return *this; }
iterator operator++(int) { iterator copy(*this); ++m_i; return copy; }
bool operator==(const iterator &other) const { return m_i == other.m_i; }
bool operator!=(const iterator &other) const { return m_i != other.m_i; }
protected:
iterator(I start) : m_i (start) { }
private:
I m_i;
};
iterator begin() const { return m_begin; }
iterator end() const { return m_end; }
Range(I begin, I end) : m_begin(begin), m_end(end) { }
private:
iterator m_begin;
iterator m_end;
};
template <typename I>
Range<I> makeRange(I begin, I end) { return { begin, end }; }
constexpr std::size_t Z = 0;
template <class C>
Range<std::size_t> makeRangeOnIndex(C const& container) {
return makeRange(Z, container.size());
}
class Relation {
Schema m_schema;
Columns m_cols;
friend Row;
Relation(Schema schema, Columns cols) : m_schema(std::move(schema)), m_cols(std::move(cols)) { }
public:
/*!
* Scans a relation from a file, given its schema and a field delimiter
*/
static Relation scan(std::string const& filename, Schema schema, char delimiter) {
std::ifstream in(filename);
Columns cols(schema.size());
assert(in);
std::string line;
while (std::getline(in, line)) {
std::istringstream iss(line);
for (auto const& column : makeRangeOnIndex(schema.columns())) {
std::string field;
assert(std::getline(iss, field, delimiter));
cols[column].push_back(std::move(field));
}
}
return Relation(std::move(schema), std::move(cols));
}
// Immutable operations!
/*!
* Selection operation: filters the rows, removing those that do *not* satisfy predicate `p`
* (i.e. keeping the one that satisfy `p`)
*
* NOTE `p` is Row => bool
*/
template <class UnaryPredicate>
Relation select(UnaryPredicate p) const {
assert(m_schema.size() > 0);
auto size = m_cols[0].size();
Columns newCols(m_schema.size());
for (std::size_t i = 0; i < size; ++i) {
Row row(*this, i);
if (p(row)) {
for (auto const& column : makeRangeOnIndex(m_schema.columns())) {
newCols[column].push_back(m_cols[column][i]);
}
}
}
return Relation(m_schema, newCols);
}
/*!
* Projection operation: reduces the columns of a relation according to a new schema
*
* NOTE the `newSchema` must be an *ordered* subset of `schema`
*/
Relation project(Schema newSchema) const {
assert(newSchema.size() <= m_schema.size());
Columns newCols(newSchema.size());
for (auto const& column : makeRangeOnIndex(newSchema.columns())) {
newCols[column] = m_cols[column];
}
return Relation(newSchema, newCols);
}
/*!
* Equi-join operation: combines each rows from two tables when their keys match
* the result relation does not have the join-key column from the right relation
*/
Relation join(Relation right, std::string const& leftKey, std::string const& rightKey) {
// Find the index of left & right key for copying the schema and rows later
auto const& rightColumns = right.m_schema.columns();
auto rightKeyIt = std::find(std::begin(rightColumns), std::end(rightColumns), rightKey);
assert(rightKeyIt != std::end(rightColumns));
auto rightKeyIdx = std::distance(std::begin(rightColumns), rightKeyIt);
auto const& leftColumns = m_schema.columns();
auto leftKeyIt = std::find(std::begin(leftColumns), std::end(leftColumns), leftKey);
assert(leftKeyIt != std::end(leftColumns));
auto leftKeyIdx = std::distance(std::begin(leftColumns), leftKeyIt);
// Build new schema
auto newSize = m_schema.size() + right.m_schema.size() - 1;
Strings newColumns(newSize);
{
// Don't copy the right key
auto last = std::copy(std::begin(m_schema.columns()), std::end(m_schema.columns()), std::begin(newColumns));
last = std::copy(std::begin(rightColumns), rightKeyIt, last);
std::copy(rightKeyIt + 1, std::end(rightColumns), last);
}
Schema newSchema(newColumns);
Columns newCols(newSchema.size());
auto rightOffset = m_schema.size();
auto sizeLeft = m_cols[0].size();
auto sizeRight = right.m_cols[0].size();
for (std::size_t i = 0; i < sizeLeft; ++i) {
for (std::size_t j = 0; j < sizeRight; ++j) {
if (m_cols[leftKeyIdx][i] == right.m_cols[rightKeyIdx][j]) {
for (auto const& column : makeRangeOnIndex(m_schema.columns())) {
newCols[column].push_back(m_cols[column][i]);
}
for (auto column : makeRangeOnIndex(m_schema.columns())) {
if (column == rightKeyIdx) continue;
else if (column > rightKeyIdx) {
newCols[rightOffset + column - 1].push_back(right.m_cols[column][i]);
} else {
newCols[rightOffset + column].push_back(right.m_cols[column][i]);
}
}
}
}
}
return Relation(newSchema, newCols);
}
/*!
* Prints the relation to standard output
*/
void print() const {
assert(m_schema.size() > 0);
auto size = m_cols[0].size();
for (std::size_t i = 0; i < size; ++i) {
for (auto const& column : makeRangeOnIndex(m_schema.columns())) {
std::cout << m_cols[column][i] << '|';
}
std::cout << '\n';
}
std::cout << std::flush;
}
};
std::string const& Row::getField(Schema const&, std::string const& fieldName) const {
auto const& columns = m_relation.m_schema.columns();
auto fieldIt = std::find(std::begin(columns), std::end(columns), fieldName);
assert(fieldIt != std::end(columns));
auto fieldIdx = std::distance(std::begin(columns), fieldIt);
return m_relation.m_cols[fieldIdx][m_index];
}
#endif
int main(int argc, const char * argv[]) {
enum class Example { A, B, C };
auto pgm = Example::B;
auto EnCSV = "data/En.csv";
auto FrCSV = "data/Fr.csv";
auto start = Clock::now();
switch (pgm) {
case Example::A:
{
auto schema = Schema({"number", "digit"});
auto En = Relation::scan(EnCSV, schema, '|');
auto selEn = En.select([&schema](auto x) { return x.getField(schema, "number") == "one"; });
auto projEn = selEn.project(Schema({"number"}));
projEn.print();
}
break;
case Example::B:
{
auto EnSchema = Schema({"number", "digit"});
auto En = Relation::scan(EnCSV, EnSchema, '|');
auto FrSchema = Schema({"digit", "nombre"});
auto Fr = Relation::scan(FrCSV, FrSchema, '|');
auto EnFr = En.join(Fr, "digit", "digit");
EnFr.print();
}
break;
case Example::C:
{
auto EnSchema = Schema({"number", "digit"});
auto En = Relation::scan(EnCSV, EnSchema, '|').select([&EnSchema](auto x) {
return x.getField(EnSchema, "number") == "one";
});
auto projEn = En.project(Schema({"number"}));
projEn.print();
}
break;
}
auto end = Clock::now();
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << "Time elapsed: " << ms.count() << "ms\n";
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment