Created
May 5, 2015 18:32
-
-
Save pudo/d810d91778e73e991b48 to your computer and use it in GitHub Desktop.
A proposed metadata structure for OpenSpending raw data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is an alternate proposal for a metadata structure for OpenSpending
# data models. The most significant change is that data is modelled in a
# way that highlights logical connections between fields, rather than
# being based on columns. This also means that column naming conventions
# are not needed.
#
# This proposal uses YAML to represent the model, but implementations
# would probably use JSON instead.
# The proposed format is currently supported by spendb and cubepress.
#
# The following is a data model for a fictitious budget/spending dataset.
# Dataset-level descriptive metadata (only the label is shown here).
metadata:
  label: "My little budget"
  # etc. etc.
# Logical data model: dimensions (each with attributes that map to
# columns of the underlying data) and measures.
model:
  # While this uses OLAP terminology, other terms are also available:
  # independent/dependent variables; concepts; etc.
  #
  # For terminology, see also:
  # http://www.w3.org/TR/vocab-data-cube/#cubes-model
  # https://developers.google.com/public-data/
  dimensions:
    project:
      label: "Project"
      description: "Project under which funds were released"
      # Each dimension can have N attributes, which relate to
      # actual columns in the underlying data.
      attributes:
        name:
          label: "Project name"
          column: "project_name"
          type: "string"
          # Flags this attribute as the dimension's display label.
          # NOTE(review): this was a second `label` key, which collided
          # with the human-readable label above (duplicate mapping keys
          # are invalid YAML; most parsers keep only the last one).
          # The key name `is_label` is a suggestion - confirm with
          # consumers of the model.
          is_label: true
        id:
          label: "Project ID"
          column: "project_id"
          type: "string"
          content: "slug"
          # TODO: indicate that this should be used for
          # identification
        description:
          label: "Project description"
          column: "project_description"
          type: "string"
          content: "longtext"
        url:
          label: "Project info URL"
          column: "project_url"
          type: "string"
          content: "url"
    supplier:
      label: "Project contractor/Supplier"
      description: "Contractor hired for this project"
      # The semantics of this dimension are annotated, instead
      # of being implicit in the column name:
      class: "entity:company"
      # Global dimension alignment, which - in theory - can do
      # fairly whacky stuff (doesn't need to be a fixed table):
      align:
        type: "reconcile"
        url: "http://opencorporates.com/reconcile/gb"
      attributes:
        name:
          label: "Supplier name"
          column: "supplier_name"
          type: "string"
          # Display-label flag; renamed from a duplicate `label` key.
          is_label: true
        vat_id:
          label: "Supplier VAT ID"
          column: "supplier_vat"
          type: "string"
        sic:
          label: "Supplier Industry"
          column: "supplier_sic"
          # NOTE(review): if these industry codes can carry leading
          # zeros, an integer type would drop them - confirm whether
          # "string" is the safer choice.
          type: "integer"
    function:
      label: "Function"
      description: "Functional classification of the project"
      # So we build out a hierarchy of these:
      class: "budget:functional"
      # e.g. budget:economic, budget:items, entity:publicbody
      # A more traditional option for the alignment:
      align:
        type: "mapping"
        taxonomy: "un:cofog:2001"
      attributes:
        name:
          label: "Function name"
          column: "function_name"
          type: "string"
          # Display-label flag; renamed from a duplicate `label` key.
          is_label: true
        code:
          label: "Function code"
          column: "function_code"
          type: "string"
    # No reason to stick with the old OpenSpending ``time`` and
    # ``amount`` convention:
    disbursement_date:
      label: "Disbursement"
      description: "Date of the actual disbursement"
      attributes:
        # It's easier to treat parts of dates as normal
        # attributes, but one could provide some syntactic
        # sugar for extracting the actual values from the
        # same underlying column.
        year:
          label: "Year"
          column: "disbursement_date|year"
          type: "date"
          format: "YYYY-MM-DD"
        month:
          label: "Month"
          column: "disbursement_date|month"
          type: "date"
          format: "YYYY-MM-DD"
        day:
          label: "Day"
          column: "disbursement_date|day"
          type: "date"
          format: "YYYY-MM-DD"
  measures:
    value:
      label: "Project value"
      column: "value"
      type: "decimal"
      class: "economic:money"
      # This should definitely no longer be a property of the
      # dataset:
      unit:
        factor: 1
        currency: "GBP"
        base_year: 2000
# This will only support loading data from a single source file. It
# is quite easy, however, to define a set of sources which could be
# referenced from the model:
sources:
  main_table:
    file: "projects_budget.csv"
  supplier_info:
    file: "suppliers.csv"
    # Links this source to another one: `remote` is written as
    # ``source:column``, `local` names a column of this source.
    join:
      remote: "main_table:supplier_vat"
      local: "vat_id"
# Given such a specification, the column specifications in the model
# could easily be qualified: ``main_table:disbursement_date``.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment