Skip to content

Instantly share code, notes, and snippets.

@jaor
Forked from aficionado/JSON PML schemas.md
Last active December 11, 2015 07:19
Show Gist options
  • Save jaor/4565594 to your computer and use it in GitHub Desktop.
Save jaor/4565594 to your computer and use it in GitHub Desktop.
JSON schemas for models
{
  "description": "A list of callback specifications. They're invoked upon resource state changes.",
  "type": "array",
  "items": {
    "description": "Each callback specification contains an HTTP method, a URL, a list of triggering states and a synchronicity flag.",
    "type": "object",
    "properties": {
      "method": {
        "description": "The HTTP method to use.",
        "optional": true,
        "default": "GET",
        "enum": ["GET", "POST", "PUT", "DELETE", "get", "post", "put", "delete"]
      },
      "url": {
        "description": "The URL to invoke with the method above.",
        "type": "string"
      },
      "states": {
        "description": "A list of states for which the callback should be called. If not given, the callback will be called for all state changes.",
        "optional": true,
        "type": "array",
        "items": {"type": "string"}
      },
      "synchronous": {
        "description": "A flag specifying whether the callback is synchronous and we need to wait for a remote response before proceeding.",
        "optional": true,
        "default": false,
        "type": "boolean"
      }
    }
  }
}
{
  "description": "A collection of fields",
  "type": "object",
  "additionalProperties": {
    "$ref": "field-schema.json"
  }
}
{
  "description": "A field from the dataset",
  "type": [
    {
      "type": "object",
      "extends": {
        "$ref": "generic-field-schema.json"
      },
      "description": "A text field from the dataset",
      "properties": {
        "optype": {
          "description": "Operational type of this field",
          "optional": true,
          "enum": ["text"]
        },
        "datatype": {
          "description": "The storage type of the field",
          "optional": true,
          "enum": ["string", "boolean"]
        },
        "summary": {
          "type": "object",
          "optional": true,
          "properties": {
            "missing_count": {
              "description": "# of instances missing this field",
              "type": "integer"
            }
          }
        }
      }
    },
    {
      "type": "object",
      "extends": {
        "$ref": "generic-field-schema.json"
      },
      "description": "A categorical field from the dataset",
      "properties": {
        "optype": {
          "description": "Operational type of this field",
          "optional": true,
          "enum": ["categorical"]
        },
        "datatype": {
          "description": "The storage type of the field",
          "optional": true,
          "enum": ["string"]
        },
        "summary": {
          "type": "object",
          "optional": true,
          "properties": {
            "missing_count": {
              "description": "# of instances missing this field",
              "type": ["integer", "null"]
            },
            "categories": {
              "description": "The possible categories for categorical fields",
              "type": "array",
              "items": {
                "type": "array",
                "items": [
                  {"type": ["string", "number"]},
                  {"type": ["integer", "null"]}
                ]
              }
            }
          }
        }
      }
    },
    {
      "type": "object",
      "extends": {
        "$ref": "generic-field-schema.json"
      },
      "description": "A numeric field from the dataset",
      "properties": {
        "optype": {
          "description": "Operational type of this field",
          "optional": true,
          "enum": ["numeric"]
        },
        "datatype": {
          "description": "The storage type of the field",
          "optional": true,
          "enum": [
            "integer", "int8", "int16", "int32", "int64",
            "float", "double", "day", "month", "year", "hour",
            "minute", "second", "millisecond", "day-of-week", "day-of-month"
          ]
        },
        "summary": {
          "type": "object",
          "optional": true,
          "properties": {
            "missing_count": {
              "description": "# of instances missing this field",
              "type": "integer"
            },
            "maximum": {
              "description": "Maximum value for numeric fields",
              "optional": true,
              "type": "number"
            },
            "minimum": {
              "description": "Minimum value for numeric fields",
              "optional": true,
              "type": "number"
            },
            "median": {
              "description": "The approximate median for numeric fields",
              "optional": true,
              "type": "number"
            },
            "sum": {
              "description": "Sum of values (for mean calculation)",
              "type": "number"
            },
            "sum_squares": {
              "description": "Sum of squared values (for variance calculation)",
              "type": "number"
            },
            "population": {
              "description": "# of instances containing data for this field",
              "type": "integer"
            },
            "mean": {
              "description": "The sample mean for numeric fields",
              "optional": true,
              "type": "number"
            },
            "variance": {
              "description": "The sample variance for numeric fields",
              "optional": true,
              "type": "number"
            },
            "standard_deviation": {
              "description": "The sample standard deviation for numeric fields",
              "optional": true,
              "type": "number"
            },
            "splits": {
              "description": "DEPRECATED - Histogram split points for this field",
              "optional": true,
              "type": "array",
              "items": {
                "type": "number"
              }
            },
            "counts": {
              "description": "Captures the distribution for the field. Contains tuples of the unique values and their occurrence counts. Used when there are 32 or less unique numeric values",
              "optional": true,
              "type": "array",
              "items": {
                "type": "array",
                "items": [
                  {"type": "number"},
                  {"type": "integer"}
                ]
              }
            },
            "bins": {
              "description": "Captures the distribution for the field. Each tuple represents a bin from an approximate histogram. Each bin contains the bin mean and a membership count. Used when there are more than 32 unique numeric values.",
              "optional": true,
              "type": "array",
              "items": {
                "type": "array",
                "items": [
                  {"type": "number"},
                  {"type": "integer"}
                ]
              }
            }
          }
        }
      }
    },
    {
      "type": "object",
      "extends": {
        "$ref": "generic-field-schema.json"
      },
      "description": "A datetime field from the dataset",
      "properties": {
        "optype": {
          "description": "Operational type of this field",
          "optional": true,
          "enum": ["datetime"]
        },
        "datatype": {
          "description": "The storage type of the field",
          "optional": true,
          "enum": ["string"]
        },
        "time_formats": {
          "description": "Formats of times in this field from clj-time",
          "optional": true,
          "type": "array",
          "items": {
            "type": "string"
          }
        },
        "summary": {
          "type": "object",
          "optional": true,
          "properties": {
            "missing_count": {
              "description": "# of instances missing this field",
              "type": "integer"
            }
          }
        }
      }
    }
  ]
}
{
  "description": "A field from the dataset",
  "type": "object",
  "properties": {
    "name": {
      "description": "Name for the field",
      "optional": true,
      "type": "string"
    },
    "description": {
      "description": "Free text description of the field",
      "optional": true,
      "type": "string"
    },
    "label": {
      "description": "A label for the field (free text for use by clients)",
      "optional": true,
      "type": "string"
    },
    "column_number": {
      "description": "Column from the data source",
      "optional": true,
      "type": "integer",
      "minimum": 0
    },
    "parent_ids": {
      "description": "IDs of the parents if this is a generated field",
      "optional": true,
      "type": "array",
      "items": {"type": "string"}
    },
    "child_ids": {
      "description": "IDs of the children if this generates other fields",
      "optional": true,
      "type": "array",
      "items": {"type": "string"}
    },
    "optype": {
      "description": "Type describing how the field will be used in the model",
      "type": "string",
      "optional": true
    },
    "auto_generated": {
      "description": "True if the field was generated from another field",
      "type": "boolean",
      "default": false,
      "optional": true
    },
    "datatype": {
      "description": "The storage type of the field",
      "type": "string",
      "optional": true
    },
    "preferred": {
      "description": "Whether the field is used by default for model creation",
      "optional": true,
      "default": true,
      "type": "boolean"
    },
    "missing_tokens": {
      "description": "Overrides global missing tokens",
      "optional": true,
      "type": "array",
      "items": {"type": "string"}
    },
    "locale": {
      "description": "Overrides global locale",
      "optional": true,
      "type": "string"
    }
  }
}

JSON schemas for models

  • model-schema.json A generic ML model, containing fields shared by most models regardless of their concrete type. It uses:
    • sample-schema.json The schema for dataset sampling specifications
    • field-collection-schema.json Auxiliary schema describing a collection of field (or "properties") descriptors
    • generic-field-schema.json Properties shared by all fields, regardless of their type.
    • field-schema.json The union schema of all field descriptor types, with their specific properties.
  • tree-model-schema.json A specialization of the model schema to decision tree models. It uses:
    • node-schema.json The schema for the nodes in a decision tree
{
  "description": "Metadata specifying a generic model",
  "type": "object",
  "properties": {
    "model": {
      "type": "object",
      "properties": {
        "kind": {
          "description": "Identifier of this model's kind (e.g., stree)",
          "optional": true,
          "type": "string"
        },
        "type": {
          "description": "Identifier of this model's type (e.g., regression)",
          "optional": true,
          "enum": ["classification", "regression"]
        },
        "dataset_id": {
          "description": "Identifier of this model's dataset",
          "type": "string"
        },
        "row_range": {
          "description": "Rows used to build this model.",
          "type": "object",
          "optional": true,
          "properties": {
            "start": {
              "type": "number",
              "minimum": 0
            },
            "size": {
              "type": "number",
              "minimum": 0
            }
          }
        },
        "ordering": {
          "enum": ["linear", "random", "deterministic"],
          "optional": true,
          "default": "deterministic"
        },
        "sample": {
          "optional": true,
          "$ref": "sample-schema.json"
        },
        "locale": {
          "description": "Default locale for field values",
          "optional": true,
          "type": "string"
        },
        "callbacks": {
          "description": "A list of callbacks.",
          "optional": true,
          "$ref": "callbacks-schema.json"
        },
        "missing_tokens": {
          "description": "Default tokens that represent a missing value",
          "type": "array",
          "optional": true,
          "default": [],
          "items": {
            "type": "string"
          }
        },
        "fields": {
          "description": "Possibly partial list of descriptors for fields used in the model",
          "optional": true,
          "$ref": "field-coll-schema.json"
        },
        "model_fields": {
          "description": "Map of descriptors for fields actually appearing in the model, without summaries",
          "optional": true,
          "$ref": "field-coll-schema.json"
        },
        "input_fields": {
          "description": "List of input field identifiers (features)",
          "optional": true,
          "type": "array",
          "items": {"type": "string"}
        },
        "excluded_input_fields": {
          "description": "List of field identifiers to exclude from input",
          "optional": true,
          "type": "array",
          "items": {"type": "string"}
        },
        "objective_fields": {
          "description": "Collection of objective field identifiers (targets)",
          "optional": true,
          "type": "array",
          "items": {"type": "string"}
        },
        "objective_field": {
          "description": "Objective field (target) identifier. Gets added to objective_fields. Useful for models with a single objective field.",
          "optional": true,
          "type": "string"
        }
      }
    }
  }
}
{
  "description": "Sampling specification. If not given, no sampling is performed.",
  "type": "object",
  "properties": {
    "rate": {
      "description": "The rate: fraction of rows we pick.",
      "type": "number",
      "minimum": 0
    },
    "replace": {
      "description": "Whether we sample with replacement or not.",
      "type": "boolean",
      "optional": true,
      "default": false
    },
    "out_of_bag": {
      "description": "Is the sampling out of bag?",
      "type": "boolean",
      "optional": true,
      "default": false
    },
    "seed": {
      "description": "A string to feed the random number generator used for sampling. The same seed produces always the same sample (if all other parameters stay the same). If not specified, we choose a seed at random.",
      "type": "string",
      "optional": true
    }
  }
}
{
  "description": "A model consisting of a decision tree",
  "type": "object",
  "extends": {
    "$ref": "model-schema.json"
  },
  "properties": {
    "model": {
      "type": "object",
      "properties": {
        "kind": {
          "description": "The kind of tree model.",
          "enum": ["stree"]
        },
        "missing_strategy": {
          "description": "Action to take on missing data.",
          "optional": true,
          "enum": ["last_prediction"]
        },
        "split_criterion": {
          "description": "Method of choosing best attribute and split point for a given node.",
          "optional": true,
          "enum": [
            "information_gain", "information_gain_ratio",
            "information_gain_mix", "squared_error",
            "squared_error_ratio"
          ]
        },
        "stat_pruning": {
          "description": "Eliminates low confidence leaf nodes from tree using statistical tests.",
          "optional": true,
          "default": true,
          "type": "boolean"
        },
        "support_threshold": {
          "description": "For a split to be valid, each child's support (instances / total instances) must be greater than this threshold.",
          "optional": true,
          "default": 0.001,
          "type": "number",
          "minimum": 0,
          "maximum": 1
        },
        "depth_threshold": {
          "description": "The depth, or generation, limit for a tree.",
          "optional": true,
          "default": 20,
          "type": "integer",
          "minimum": 1
        },
        "prune_holdout": {
          "description": "DEPRECATED - Pruning with a holdout set.",
          "optional": true,
          "type": "number",
          "minimum": 0,
          "maximum": 1
        },
        "freeze_threshold": {
          "description": "PRIVATE - Once a field histogram exceeds this many inserts, its bin locations are 'frozen' into place to improve the performance of future inserts.",
          "optional": true,
          "default": 4096,
          "type": "integer",
          "minimum": 0
        },
        "objective_histogram_size": {
          "description": "PRIVATE - Size of the histograms for capturing the objective field.",
          "optional": true,
          "default": 32,
          "type": "integer"
        },
        "field_histogram_size": {
          "description": "PRIVATE - Size of the histograms for capturing input fields.",
          "optional": true,
          "default": 64,
          "type": "integer"
        },
        "similarity_threshold": {
          "description": "PRIVATE - The threshold for early splitting (lower requires more similarity).",
          "optional": true,
          "default": 0.15,
          "type": "number",
          "minimum": 0,
          "maximum": 1
        },
        "split_score_threshold": {
          "description": "PRIVATE - The minimum score required for a split to be valid.",
          "optional": true,
          "default": 1.0E-12,
          "type": "number",
          "minimum": 0,
          "maximum": 1
        },
        "node_threshold": {
          "description": "PRIVATE - The soft limit for number of nodes in the tree.",
          "optional": true,
          "default": 1024,
          "type": "integer"
        },
        "selective_pruning": {
          "description": "PRIVATE - When true, stat pruning will have less effect for small datasets.",
          "optional": true,
          "type": "boolean"
        },
        "z_statistic": {
          "description": "PRIVATE - Parameter for stat pruning.",
          "optional": true,
          "default": 2,
          "type": "number"
        },
        "split_early": {
          "description": "PRIVATE - Allows early splits.",
          "optional": true,
          "default": true,
          "type": "boolean"
        },
        "importance": {
          "description": "Contains pairs of field ids and importance scores (one for each input field). The higher the score, the more the field helps reduce error on the training set.",
          "optional": true,
          "type": "array",
          "items": {
            "type": "array",
            "items": [
              {"type": "string"},
              {"type": "number"}
            ]
          }
        },
        "root": {
          "description": "The root node of the decision tree.",
          "optional": true,
          "$ref": "node-schema.json"
        }
      }
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment