Skip to content

Instantly share code, notes, and snippets.

@pudo
Created October 4, 2011 09:53
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pudo/1261271 to your computer and use it in GitHub Desktop.
Save pudo/1261271 to your computer and use it in GitHub Desktop.
/* Proposed OpenSpending data model, v2 */
/* Core issues we want to address:
*
* - Merge "entities" and "classifiers".
* - Use better terminology.
* - Decide whether to still have value dimensions.
* - Handle time better (no "end_column").
*
* Secondary but long-term goals:
*
* - Dimension hierarchies.
* - Soft typing of dimensions.
*/
/**
 * Worked example of the proposed v2 data model, written as a plain object
 * literal.
 *
 * Top-level keys:
 *   - "dataset": dataset metadata.
 *   - "mapping": measures and dimensions, followed by the alternative
 *     proposals ("transaction_id" with attributes, "model", nested "mapping")
 *     that are kept here for discussion.
 */
var model_proposal_fl = {
  /* Dataset metadata */
  "dataset": {
    /* Name. A number of words need to be forbidden as names (presumably
     * reserved ones; TODO confirm). */
    "name": "test",
    /* Label, a free-form description. */
    "label": "Example Dataset",
    "description": "I'm an example dataset, not much to see here.",
    /* Set of entry properties sufficient to make each entry unique in
     * the dataset. */
    "unique_keys": ["transaction_id"]
  },
  "mapping": {
    "amount": {
      "label": "Total amount",
      "description": "...",
      /* New type of field in mapping: */
      "type": "measure",
      /* Don't do float for money any longer: */
      "datatype": "decimal",
      "column": "amount"
      /* TODO: Should this have support for default values and
       * constant values? */
    },
    /* Secondary measure: */
    "cofinance_amount": {
      "label": "Co-financed amount",
      "description": "...",
      /* New type of field in mapping: */
      "type": "measure",
      /* Don't do float for money any longer: */
      "datatype": "decimal",
      "column": "cofinance"
    },
    /* Time dimension. Note it is not called "time". Good idea? */
    "grant_date": {
      "label": "Date of grant",
      "description": "...",
      /* Optional: type for field in mapping. */
      "type": "date",
      "datatype": "date",
      /* NOTE
       * This is a hybrid type: the datatype should first yield Python date
       * objects, but then we'll store something else to the database: either
       * a DATETIME column or (cleaner) a proper date table with multiple
       * fields:
       *
       * date_id, year, month, quarter, day
       */
      "column": "date_grant"
      /* NOTE
       * I propose we abolish the "end_column" hack and have multiple date
       * dimensions if necessary: grant_date, disbursed_date, project_end.
       */
    },
    /* Have simple attribute dimensions for non-compound entities with many
     * different values.
     *
     * NOTE: the key "transaction_id" is repeated further down (in the
     * ALTERNATIVES section). When this literal is evaluated, the later
     * definition replaces this one. */
    "transaction_id": {
      "label": "Transaction ID",
      "description": "...",
      "facet": false,
      /* Could also be "attribute": */
      "type": "value",
      "column": "transaction_id",
      /* This is assumed by default: */
      "datatype": "string"
    },
    /* The thing formerly known as an entity: */
    "beneficiary": {
      "label": "Beneficiary",
      "description": "...",
      /* This is assumed as a default, supersedes "type": "entity": */
      "scheme": "entity",
      /* Just an idea, hard to actually enumerate: */
      "classes": ["individual", "company", "nonprofit"],
      "facet": true,
      /* IMO these can also have this - but nick will disagree?
       * (Key spelled like the dataset-level "unique_keys".) */
      "unique_keys": ["label", "country"],
      "attributes": {
        /* Always enforce presence of a name attribute? */
        "name": {
          /* Re-name ID */
          "datatype": "slug",
          "column": "beneficiary"
        },
        "label": {
          "datatype": "string",
          "column": "beneficiary"
        },
        "country": {
          "datatype": "string",
          "column": "beneficiary_country"
        }
      }
    },
    /* The thing formerly known as a classifier: */
    "objective": {
      "label": "Objective (Level 1)",
      "description": "...",
      "scheme": "funding-taxonomy",
      "facet": true,
      "attributes": {
        "name": {
          "datatype": "slug",
          "column": "objective"
        },
        "label": {
          "datatype": "string",
          "column": "objective"
        }
      }
    },
    /* Support hierarchies of classifiers: */
    "goal": {
      "label": "Goal (Level 2)",
      "description": "...",
      "scheme": "funding-taxonomy",
      /* Proposal: let's have the parent classifier given by name and specify
       * a column on this dimension that refers back to the higher-level
       * dimension. The latter is not strictly necessary, I think.
       */
      "parent": {"objective": "objective_name"},
      "attributes": {
        "name": {
          "datatype": "slug",
          "column": "goal"
        },
        "label": {
          "datatype": "string",
          "column": "goal"
        },
        /* Attribute named by "parent" above; reads the same column as the
         * "objective" dimension's name. */
        "objective_name": {
          "datatype": "slug",
          "column": "objective"
        }
      }
    },
    /*
     *
     * ALTERNATIVES
     *
     */
    /*
     * mk270: remove "Value Dimensions" completely, have all dimensions with
     * attributes.
     *
     * NOTE: this repeats the "transaction_id" key defined earlier in this
     * object; being the later definition, it is the one that survives
     * evaluation.
     */
    "transaction_id": {
      "label": "Transaction ID",
      "description": "...",
      "facet": false,
      "attributes": [{
        /* Could also be "attribute": */
        "column": "transaction_id",
        /* This is assumed by default: */
        "datatype": "string"
      }]
      /* Criticism:
       * 1) Utility: it's nice and easy to have attributes on entries,
       *    although not strictly clean.
       * 2) Makes a nicer hash representation of attributes impossible.
       * 3) How do you enforce there's only one?
       * 4) Given that "value dimensions" can be generated by the same code as
       *    "dimension attributes" - is it really easier to generate this?
       */
    },
    /*
     * pudo: separate "Dimension Model" from "Source file mapping":
     *
     * -> This is one level up, mentally:
     */
    /* This is fully abstract, e.g. for all of ERDF, ESF or some such
     * collection of data sets:
     */
    "model": {
      "measure1": { /* Full description. */ },
      "measure2": { /* Full description. */ },
      "dimension1": { /* Full description with attributes. */ },
      "dimension2": { /* Full description with attributes. */ },
      "dimension3": { /* Full description with attributes. */ },
      "dimension4": { /* Full description with attributes. */ }
    },
    /* This is specific to one CSV representation. */
    "mapping": {
      "measure1": {"column": "measure", "datatype": "decimal"},
      "measure2": {"column": "other_measure", "datatype": "decimal"},
      "dimension1.attrib1": {"column": "foo", "datatype": "string"},
      "dimension1.attrib2": {"column": "bar", "datatype": "string"}
    }
  }
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment