Created
April 28, 2016 14:36
-
-
Save mantognini/6863b2a42f1871d5728a6b9cd82a16fd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// main.cpp | |
// MiniRelation | |
// | |
// Created by Marco Antognini on 28/04/16. | |
// Copyright © 2016 Marco Antognini. All rights reserved. | |
// | |
#include <algorithm> | |
#include <cassert> | |
#include <chrono> | |
#include <fstream> | |
#include <functional> | |
#include <iostream> | |
#include <iterator> | |
#include <map> | |
#include <sstream> | |
#include <stdexcept> | |
#include <unordered_map> | |
#include <utility> | |
#include <vector> | |
#define ROW_STORE 0 | |
using Strings = std::vector<std::string>; | |
using Clock = std::chrono::high_resolution_clock; | |
template <class K, class V> | |
using Map = std::unordered_map<K, V>; | |
//using Map = std::map<K, V>; | |
class Schema { | |
Strings m_columns; | |
public: | |
Schema(Strings columns) : m_columns(std::move(columns)) { } | |
auto const& columns() const { return m_columns; } | |
auto size() const { return m_columns.size(); } | |
}; | |
#if ROW_STORE == 1 | |
class Row { | |
Strings m_values; | |
friend class Relation; | |
public: | |
Row(Strings values) : m_values(std::move(values)) { } | |
std::string const& getField(Schema const& schema, std::string const& fieldName) const { | |
assert(schema.size() == m_values.size()); | |
// TODO use std::find? | |
std::size_t i = 0; | |
for (auto const& col : schema.columns()) { | |
if (col == fieldName) return m_values[i]; | |
++i; | |
} | |
throw std::runtime_error("No such field <" + fieldName + "> in schema."); | |
} | |
}; | |
using Rows = std::vector<Row>; | |
class Relation { | |
Schema m_schema; | |
Rows m_rows; | |
Relation(Schema schema, Rows rows) : m_schema(std::move(schema)), m_rows(std::move(rows)) { } | |
public: | |
/*! | |
* Scans a relation from a file, given its schema and a field delimiter | |
*/ | |
static Relation scan(std::string const& filename, Schema schema, char delimiter) { | |
std::ifstream in(filename); | |
Rows rows; | |
assert(in); | |
std::string line; | |
while (std::getline(in, line)) { | |
Strings fields(schema.size()); | |
std::istringstream iss(line); | |
for (std::size_t i = 0; i < schema.size(); ++i) { | |
assert(std::getline(iss, fields[i], delimiter)); | |
} | |
rows.push_back(std::move(fields)); | |
} | |
return Relation(std::move(schema), std::move(rows)); | |
} | |
// Immutable operations! | |
/*! | |
* Selection operation: filters the rows, removing those that do *not* satisfy predicate `p` | |
* (i.e. keeping the one that satisfy `p`) | |
*/ | |
template <class UnaryPredicate> | |
Relation select(UnaryPredicate p) const { | |
Rows selected; | |
std::copy_if(std::begin(m_rows), std::end(m_rows), | |
std::back_inserter(selected), | |
p); | |
return Relation(m_schema, selected); | |
} | |
/*! | |
* Projection operation: reduces the columns of a relation according to a new schema | |
* | |
* NOTE the `newSchema` must be an *ordered* subset of `schema` | |
*/ | |
Relation project(Schema newSchema) const { | |
assert(newSchema.size() <= m_schema.size()); | |
// TODO might be good to test std::stable_partition with a "mutable" predicate | |
// a bit like http://stackoverflow.com/a/21148808/520217 | |
// Compute set of columns to remove | |
std::vector<bool> indexes(m_schema.size()); // false => keep | |
for (std::size_t i = 0, j = 0; i < m_schema.size(); ++i) { | |
if (i >= newSchema.size()) { | |
indexes[i] = true; | |
} else if (m_schema.columns()[i] == newSchema.columns()[j]) { | |
indexes[i] = false; | |
++j; | |
} else { | |
indexes[i] = true; | |
} | |
} | |
// Remove some columns | |
auto projected = m_rows; | |
for (auto& row : projected) { | |
auto it = std::begin(row.m_values); | |
for (std::size_t i = 0; i < indexes.size(); ++i) { | |
if (indexes[i]) { | |
it = row.m_values.erase(it); | |
} else { | |
++it; | |
} | |
} | |
} | |
return Relation(std::move(newSchema), std::move(projected)); | |
} | |
/*! | |
* Equi-join operation: combines each rows from two tables when their keys match | |
* the result relation does not have the join-key column from the right relation | |
*/ | |
Relation join(Relation right, std::string const& leftKey, std::string const& rightKey) { | |
// Find the index of rightKey for copying the schema and rows later | |
auto const& rightColumns = right.m_schema.columns(); | |
auto rightKeyIt = std::find(std::begin(rightColumns), std::end(rightColumns), rightKey); | |
assert(rightKeyIt != std::end(rightColumns)); | |
auto rightKeyIdx = std::distance(std::begin(rightColumns), rightKeyIt); | |
// Build new schema | |
auto newSize = m_schema.size() + right.m_schema.size() - 1; | |
Strings newColumns(newSize); | |
{ | |
// Don't copy the right key | |
auto last = std::copy(std::begin(m_schema.columns()), std::end(m_schema.columns()), std::begin(newColumns)); | |
last = std::copy(std::begin(rightColumns), rightKeyIt, last); | |
std::copy(rightKeyIt + 1, std::end(rightColumns), last); | |
} | |
Schema newSchema(newColumns); | |
Rows newRows; | |
for (auto const& r1 : m_rows) { | |
for (auto const& r2 : right.m_rows) { | |
if (r1.getField(m_schema, leftKey) == r2.getField(right.m_schema, rightKey)) { | |
Strings fields(newSize); | |
{ | |
// Don't copy the right key | |
auto last = std::copy(std::begin(r1.m_values), std::end(r1.m_values), std::begin(fields)); | |
auto rightSkipIt = std::begin(r2.m_values) + rightKeyIdx; | |
last = std::copy(std::begin(r2.m_values), rightSkipIt, last); | |
std::copy(rightSkipIt + 1, std::end(r2.m_values), last); | |
} | |
newRows.push_back(Row(std::move(fields))); | |
} | |
} | |
} | |
return Relation(newSchema, newRows); | |
} | |
/*! | |
* Prints the relation to standard output | |
*/ | |
void print() const { | |
for (auto const& row : m_rows) { | |
for (auto const& field : row.m_values) { | |
std::cout << field << '|'; | |
} | |
std::cout << '\n'; | |
} | |
std::cout << std::flush; | |
} | |
}; | |
#else | |
using Column = std::vector<std::string>; | |
//using Columns = Map<std::string, Column>; // TOO SLOW! | |
using Columns = std::vector<Column>; // using index for "key" | |
// Row is a prox only and doesn't own any record! | |
class Relation; | |
class Row { | |
friend Relation; | |
Relation const& m_relation; | |
std::size_t const m_index; | |
Row(Relation const& relation, std::size_t index) : m_relation(relation), m_index(index) { } | |
public: | |
std::string const& getField(Schema const&, std::string const& fieldName) const; | |
}; | |
template <typename I> | |
class Range { | |
public: | |
class iterator { | |
friend class Range; | |
public: | |
I operator*() const { return m_i; } | |
iterator const& operator++() { ++m_i; return *this; } | |
iterator operator++(int) { iterator copy(*this); ++m_i; return copy; } | |
bool operator==(const iterator &other) const { return m_i == other.m_i; } | |
bool operator!=(const iterator &other) const { return m_i != other.m_i; } | |
protected: | |
iterator(I start) : m_i (start) { } | |
private: | |
I m_i; | |
}; | |
iterator begin() const { return m_begin; } | |
iterator end() const { return m_end; } | |
Range(I begin, I end) : m_begin(begin), m_end(end) { } | |
private: | |
iterator m_begin; | |
iterator m_end; | |
}; | |
template <typename I> | |
Range<I> makeRange(I begin, I end) { return { begin, end }; } | |
constexpr std::size_t Z = 0; | |
template <class C> | |
Range<std::size_t> makeRangeOnIndex(C const& container) { | |
return makeRange(Z, container.size()); | |
} | |
class Relation { | |
Schema m_schema; | |
Columns m_cols; | |
friend Row; | |
Relation(Schema schema, Columns cols) : m_schema(std::move(schema)), m_cols(std::move(cols)) { } | |
public: | |
/*! | |
* Scans a relation from a file, given its schema and a field delimiter | |
*/ | |
static Relation scan(std::string const& filename, Schema schema, char delimiter) { | |
std::ifstream in(filename); | |
Columns cols(schema.size()); | |
assert(in); | |
std::string line; | |
while (std::getline(in, line)) { | |
std::istringstream iss(line); | |
for (auto const& column : makeRangeOnIndex(schema.columns())) { | |
std::string field; | |
assert(std::getline(iss, field, delimiter)); | |
cols[column].push_back(std::move(field)); | |
} | |
} | |
return Relation(std::move(schema), std::move(cols)); | |
} | |
// Immutable operations! | |
/*! | |
* Selection operation: filters the rows, removing those that do *not* satisfy predicate `p` | |
* (i.e. keeping the one that satisfy `p`) | |
* | |
* NOTE `p` is Row => bool | |
*/ | |
template <class UnaryPredicate> | |
Relation select(UnaryPredicate p) const { | |
assert(m_schema.size() > 0); | |
auto size = m_cols[0].size(); | |
Columns newCols(m_schema.size()); | |
for (std::size_t i = 0; i < size; ++i) { | |
Row row(*this, i); | |
if (p(row)) { | |
for (auto const& column : makeRangeOnIndex(m_schema.columns())) { | |
newCols[column].push_back(m_cols[column][i]); | |
} | |
} | |
} | |
return Relation(m_schema, newCols); | |
} | |
/*! | |
* Projection operation: reduces the columns of a relation according to a new schema | |
* | |
* NOTE the `newSchema` must be an *ordered* subset of `schema` | |
*/ | |
Relation project(Schema newSchema) const { | |
assert(newSchema.size() <= m_schema.size()); | |
Columns newCols(newSchema.size()); | |
for (auto const& column : makeRangeOnIndex(newSchema.columns())) { | |
newCols[column] = m_cols[column]; | |
} | |
return Relation(newSchema, newCols); | |
} | |
/*! | |
* Equi-join operation: combines each rows from two tables when their keys match | |
* the result relation does not have the join-key column from the right relation | |
*/ | |
Relation join(Relation right, std::string const& leftKey, std::string const& rightKey) { | |
// Find the index of left & right key for copying the schema and rows later | |
auto const& rightColumns = right.m_schema.columns(); | |
auto rightKeyIt = std::find(std::begin(rightColumns), std::end(rightColumns), rightKey); | |
assert(rightKeyIt != std::end(rightColumns)); | |
auto rightKeyIdx = std::distance(std::begin(rightColumns), rightKeyIt); | |
auto const& leftColumns = m_schema.columns(); | |
auto leftKeyIt = std::find(std::begin(leftColumns), std::end(leftColumns), leftKey); | |
assert(leftKeyIt != std::end(leftColumns)); | |
auto leftKeyIdx = std::distance(std::begin(leftColumns), leftKeyIt); | |
// Build new schema | |
auto newSize = m_schema.size() + right.m_schema.size() - 1; | |
Strings newColumns(newSize); | |
{ | |
// Don't copy the right key | |
auto last = std::copy(std::begin(m_schema.columns()), std::end(m_schema.columns()), std::begin(newColumns)); | |
last = std::copy(std::begin(rightColumns), rightKeyIt, last); | |
std::copy(rightKeyIt + 1, std::end(rightColumns), last); | |
} | |
Schema newSchema(newColumns); | |
Columns newCols(newSchema.size()); | |
auto rightOffset = m_schema.size(); | |
auto sizeLeft = m_cols[0].size(); | |
auto sizeRight = right.m_cols[0].size(); | |
for (std::size_t i = 0; i < sizeLeft; ++i) { | |
for (std::size_t j = 0; j < sizeRight; ++j) { | |
if (m_cols[leftKeyIdx][i] == right.m_cols[rightKeyIdx][j]) { | |
for (auto const& column : makeRangeOnIndex(m_schema.columns())) { | |
newCols[column].push_back(m_cols[column][i]); | |
} | |
for (auto column : makeRangeOnIndex(m_schema.columns())) { | |
if (column == rightKeyIdx) continue; | |
else if (column > rightKeyIdx) { | |
newCols[rightOffset + column - 1].push_back(right.m_cols[column][i]); | |
} else { | |
newCols[rightOffset + column].push_back(right.m_cols[column][i]); | |
} | |
} | |
} | |
} | |
} | |
return Relation(newSchema, newCols); | |
} | |
/*! | |
* Prints the relation to standard output | |
*/ | |
void print() const { | |
assert(m_schema.size() > 0); | |
auto size = m_cols[0].size(); | |
for (std::size_t i = 0; i < size; ++i) { | |
for (auto const& column : makeRangeOnIndex(m_schema.columns())) { | |
std::cout << m_cols[column][i] << '|'; | |
} | |
std::cout << '\n'; | |
} | |
std::cout << std::flush; | |
} | |
}; | |
std::string const& Row::getField(Schema const&, std::string const& fieldName) const { | |
auto const& columns = m_relation.m_schema.columns(); | |
auto fieldIt = std::find(std::begin(columns), std::end(columns), fieldName); | |
assert(fieldIt != std::end(columns)); | |
auto fieldIdx = std::distance(std::begin(columns), fieldIt); | |
return m_relation.m_cols[fieldIdx][m_index]; | |
} | |
#endif | |
int main(int argc, const char * argv[]) { | |
enum class Example { A, B, C }; | |
auto pgm = Example::B; | |
auto EnCSV = "data/En.csv"; | |
auto FrCSV = "data/Fr.csv"; | |
auto start = Clock::now(); | |
switch (pgm) { | |
case Example::A: | |
{ | |
auto schema = Schema({"number", "digit"}); | |
auto En = Relation::scan(EnCSV, schema, '|'); | |
auto selEn = En.select([&schema](auto x) { return x.getField(schema, "number") == "one"; }); | |
auto projEn = selEn.project(Schema({"number"})); | |
projEn.print(); | |
} | |
break; | |
case Example::B: | |
{ | |
auto EnSchema = Schema({"number", "digit"}); | |
auto En = Relation::scan(EnCSV, EnSchema, '|'); | |
auto FrSchema = Schema({"digit", "nombre"}); | |
auto Fr = Relation::scan(FrCSV, FrSchema, '|'); | |
auto EnFr = En.join(Fr, "digit", "digit"); | |
EnFr.print(); | |
} | |
break; | |
case Example::C: | |
{ | |
auto EnSchema = Schema({"number", "digit"}); | |
auto En = Relation::scan(EnCSV, EnSchema, '|').select([&EnSchema](auto x) { | |
return x.getField(EnSchema, "number") == "one"; | |
}); | |
auto projEn = En.project(Schema({"number"})); | |
projEn.print(); | |
} | |
break; | |
} | |
auto end = Clock::now(); | |
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start); | |
std::cout << "Time elapsed: " << ms.count() << "ms\n"; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment