Skip to content

Instantly share code, notes, and snippets.

@aficionado
Last active September 4, 2018 23:49
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save aficionado/4565563 to your computer and use it in GitHub Desktop.
Save aficionado/4565563 to your computer and use it in GitHub Desktop.
JSON PML schemas
{
  "description": "A list of callback specifications. They're invoked upon resource state changes.",
  "type": "array",
  "items": {
    "description": "Each callback specification contains an HTTP method, a URL, a list of triggering states and a synchronicity flag.",
    "type": "object",
    "properties": {
      "method": {
        "description": "The HTTP method to use.",
        "optional": true,
        "default": "GET",
        "enum": ["GET", "POST", "PUT", "DELETE", "get", "post", "put", "delete"]
      },
      "url": {
        "description": "The URL to invoke with the method above.",
        "type": "string"
      },
      "states": {
        "description": "A list of states for which the callback should be called. If not given, the callback will be called for all state changes.",
        "optional": true,
        "type": "array",
        "items": {
          "type": "string"
        }
      },
      "synchronous": {
        "description": "A flag specifying whether the callback is synchronous and we need to wait for a remote response before proceeding.",
        "optional": true,
        "default": false,
        "type": "boolean"
      }
    }
  }
}
{
  "description": "A collection of fields",
  "type": "object",
  "additionalProperties": {
    "$ref": "field-schema.json"
  }
}
{
  "description": "A field from the dataset",
  "type": [
    {
      "type": "object",
      "extends": {
        "$ref": "generic-field-schema.json"
      },
      "description": "A text field from the dataset",
      "properties": {
        "optype": {
          "description": "Operational type of this field",
          "optional": true,
          "enum": ["text"]
        },
        "datatype": {
          "description": "The storage type of the field",
          "optional": true,
          "enum": ["string", "boolean"]
        },
        "summary": {
          "type": "object",
          "optional": true,
          "properties": {
            "missing_count": {
              "description": "# of instances missing this field",
              "type": "integer"
            }
          }
        }
      }
    },
    {
      "type": "object",
      "extends": {
        "$ref": "generic-field-schema.json"
      },
      "description": "A categorical field from the dataset",
      "properties": {
        "optype": {
          "description": "Operational type of this field",
          "optional": true,
          "enum": ["categorical"]
        },
        "datatype": {
          "description": "The storage type of the field",
          "optional": true,
          "enum": ["string"]
        },
        "summary": {
          "type": "object",
          "optional": true,
          "properties": {
            "missing_count": {
              "description": "# of instances missing this field",
              "type": ["integer", "null"]
            },
            "categories": {
              "description": "The possible categories for categorical fields",
              "type": "array",
              "items": {
                "type": "array",
                "items": [
                  {"type": ["string", "number"]},
                  {"type": ["integer", "null"]}
                ]
              }
            }
          }
        }
      }
    },
    {
      "type": "object",
      "extends": {
        "$ref": "generic-field-schema.json"
      },
      "description": "A numeric field from the dataset",
      "properties": {
        "optype": {
          "description": "Operational type of this field",
          "optional": true,
          "enum": ["numeric"]
        },
        "datatype": {
          "description": "The storage type of the field",
          "optional": true,
          "enum": [
            "integer", "int8", "int16", "int32", "int64",
            "float", "double", "day", "month", "year", "hour",
            "minute", "second", "millisecond", "day-of-week", "day-of-month"
          ]
        },
        "summary": {
          "type": "object",
          "optional": true,
          "properties": {
            "missing_count": {
              "description": "# of instances missing this field",
              "type": "integer"
            },
            "maximum": {
              "description": "Maximum value for numeric fields",
              "optional": true,
              "type": "number"
            },
            "minimum": {
              "description": "Minimum value for numeric fields",
              "optional": true,
              "type": "number"
            },
            "median": {
              "description": "The approximate median for numeric fields",
              "optional": true,
              "type": "number"
            },
            "sum": {
              "description": "Sum of values (for mean calculation)",
              "type": "number"
            },
            "sum_squares": {
              "description": "Sum of squared values (for variance calculation)",
              "type": "number"
            },
            "population": {
              "description": "# of instances containing data for this field",
              "type": "integer"
            },
            "mean": {
              "description": "The sample mean for numeric fields",
              "optional": true,
              "type": "number"
            },
            "variance": {
              "description": "The sample variance for numeric fields",
              "optional": true,
              "type": "number"
            },
            "standard_deviation": {
              "description": "The sample standard deviation for numeric fields",
              "optional": true,
              "type": "number"
            },
            "splits": {
              "description": "DEPRECATED - Histogram split points for this field",
              "optional": true,
              "type": "array",
              "items": {
                "type": "number"
              }
            },
            "counts": {
              "description": "Captures the distribution for the field. Contains tuples of the unique values and their occurrence counts. Used when there are 32 or less unique numeric values",
              "optional": true,
              "type": "array",
              "items": {
                "type": "array",
                "items": [
                  {"type": "number"},
                  {"type": "integer"}
                ]
              }
            },
            "bins": {
              "description": "Captures the distribution for the field. Each tuple represents a bin from an approximate histogram. Each bin contains the bin mean and a membership count. Used when there are more than 32 unique numeric values.",
              "optional": true,
              "type": "array",
              "items": {
                "type": "array",
                "items": [
                  {"type": "number"},
                  {"type": "integer"}
                ]
              }
            }
          }
        }
      }
    },
    {
      "type": "object",
      "extends": {
        "$ref": "generic-field-schema.json"
      },
      "description": "A datetime field from the dataset",
      "properties": {
        "optype": {
          "description": "Operational type of this field",
          "optional": true,
          "enum": ["datetime"]
        },
        "datatype": {
          "description": "The storage type of the field",
          "optional": true,
          "enum": ["string"]
        },
        "time_formats": {
          "description": "Formats of times in this field from clj-time",
          "optional": true,
          "type": "array",
          "items": {
            "type": "string"
          }
        },
        "summary": {
          "type": "object",
          "optional": true,
          "properties": {
            "missing_count": {
              "description": "# of instances missing this field",
              "type": "integer"
            }
          }
        }
      }
    }
  ]
}
{
  "description": "A field from the dataset",
  "type": "object",
  "properties": {
    "name": {
      "description": "Name for the field",
      "optional": true,
      "type": "string"
    },
    "description": {
      "description": "Free text description of the field",
      "optional": true,
      "type": "string"
    },
    "label": {
      "description": "A label for the field (free text for use by clients)",
      "optional": true,
      "type": "string"
    },
    "column_number": {
      "description": "Column from the data source",
      "optional": true,
      "type": "integer",
      "minimum": 0
    },
    "parent_ids": {
      "description": "IDs of the parents if this is a generated field",
      "optional": true,
      "type": "array",
      "items": {
        "type": "string"
      }
    },
    "child_ids": {
      "description": "IDs of the children if this generates other fields",
      "optional": true,
      "type": "array",
      "items": {
        "type": "string"
      }
    },
    "optype": {
      "description": "Type describing how the field will be used in the model",
      "type": "string",
      "optional": true
    },
    "auto_generated": {
      "description": "True if the field was generated from another field",
      "type": "boolean",
      "default": false,
      "optional": true
    },
    "datatype": {
      "description": "The storage type of the field",
      "type": "string",
      "optional": true
    },
    "preferred": {
      "description": "Whether the field is used by default for model creation",
      "optional": true,
      "default": true,
      "type": "boolean"
    },
    "missing_tokens": {
      "description": "Overrides global missing tokens",
      "optional": true,
      "type": "array",
      "items": {
        "type": "string"
      }
    },
    "locale": {
      "description": "Overrides global locale",
      "optional": true,
      "type": "string"
    }
  }
}

JSON PML: schemas for models

  • model-schema.json A generic ML model, containing fields shared by most models regardless of their concrete type. It uses:
    • sample-schema.json The schema for dataset sampling specifications
    • field-collection-schema.json Auxiliary schema describing a collection of field (or "properties") descriptors
    • generic-field-schema.json Properties shared by all fields, regardless of their type.
    • field-schema.json The union schema of all field descriptor types, with their specific properties.
  • tree-model-schema.json A specialization of the model schema to decision tree models. It uses:
    • node-schema.json The schema for the nodes in a decision tree
{
  "description": "Metadata specifying a generic model",
  "type": "object",
  "properties": {
    "model": {
      "type": "object",
      "properties": {
        "kind": {
          "description": "Identifier of this model's kind (e.g., stree)",
          "optional": true,
          "type": "string"
        },
        "type": {
          "description": "Identifier of this model's type (e.g., regression)",
          "optional": true,
          "enum": ["classification", "regression"]
        },
        "dataset_id": {
          "description": "Identifier of this model's dataset",
          "type": "string"
        },
        "row_range": {
          "description": "Rows used to build this model.",
          "type": "object",
          "optional": true,
          "properties": {
            "start": {
              "type": "number",
              "minimum": 0
            },
            "size": {
              "type": "number",
              "minimum": 0
            }
          }
        },
        "ordering": {
          "enum": ["linear", "random", "deterministic"],
          "optional": true,
          "default": "deterministic"
        },
        "sample": {
          "optional": true,
          "$ref": "sample-schema.json"
        },
        "locale": {
          "description": "Default locale for field values",
          "optional": true,
          "type": "string"
        },
        "callbacks": {
          "description": "A list of callbacks.",
          "optional": true,
          "$ref": "callbacks-schema.json"
        },
        "missing_tokens": {
          "description": "Default tokens that represent a missing value",
          "type": "array",
          "optional": true,
          "default": [],
          "items": {
            "type": "string"
          }
        },
        "fields": {
          "description": "Possibly partial list of descriptors for fields used in the model",
          "optional": true,
          "$ref": "field-coll-schema.json"
        },
        "model_fields": {
          "description": "Map of descriptors for fields actually appearing in the model, without summaries",
          "optional": true,
          "$ref": "field-coll-schema.json"
        },
        "input_fields": {
          "description": "List of input field identifiers (features)",
          "optional": true,
          "type": "array",
          "items": {
            "type": "string"
          }
        },
        "excluded_input_fields": {
          "description": "List of field identifiers to exclude from input",
          "optional": true,
          "type": "array",
          "items": {
            "type": "string"
          }
        },
        "objective_fields": {
          "description": "Collection of objective field identifiers (targets)",
          "optional": true,
          "type": "array",
          "items": {
            "type": "string"
          }
        },
        "objective_field": {
          "description": "Objective field (target) identifier. Gets added to objective_fields. Useful for models with a single objective field.",
          "optional": true,
          "type": "string"
        }
      }
    }
  }
}
{
  "type": "object",
  "properties": {
    "count": {
      "description": "Number of training instances at this node",
      "type": "number"
    },
    "predicate": {
      "type": [
        "boolean",
        {
          "type": "object",
          "properties": {
            "field": {
              "description": "Field used for this decision",
              "type": "string"
            },
            "operator": {
              "description": "Type of test used for the field",
              "type": "string"
            },
            "value": {
              "description": "Value the field is tested against in this decision",
              "type": ["number", "string"]
            }
          }
        }
      ]
    },
    "output": {
      "description": "Prediction given at this node",
      "type": ["number", "string"]
    },
    "confidence": {
      "optional": true,
      "description": "Probability of correctness for classification and an estimate of the error for regression.",
      "type": ["number"],
      "minimum": 0
    },
    "objective_summary": {
      "type": "object",
      "optional": true,
      "properties": {
        "categories": {
          "description": "Captures the distribution for the objective field. Each tuple contains a category and the category occurrence count. Used when the objective is categorical. For partial models, counts are an estimation and, as such, they won't be in general integers. But for finished models the pairs will always consist of a string and an integer.",
          "optional": true,
          "type": "array",
          "items": {
            "type": "array",
            "items": [
              {"type": ["string", "number"]},
              {"type": "number"}
            ]
          }
        },
        "counts": {
          "description": "Captures the distribution for the objective field. Contains tuples of the unique values and their occurrence counts. Used when there are 32 or less unique numeric values. During model construction, these counts can be fractional for partial models. When the model is finished, however, all counts will be integers.",
          "optional": true,
          "type": "array",
          "items": {
            "type": "array",
            "items": [
              {"type": "number"},
              {"type": "number"}
            ]
          }
        },
        "bins": {
          "description": "Captures the distribution for the objective field. Each tuple represents a bin from an approximate histogram. Each bin contains the bin mean and a membership count. Used when there are more than 32 unique numeric values. During model construction, the counts can be fractional for partial models. When the model is finished, however, all counts will be integers.",
          "optional": true,
          "type": "array",
          "items": {
            "type": "array",
            "items": [
              {"type": "number"},
              {"type": "number"}
            ]
          }
        },
        "maximum": {
          "description": "Maximum value for numeric fields, used when 'bins' are present.",
          "optional": true,
          "type": "number"
        },
        "minimum": {
          "description": "Minimum value for numeric fields, used when 'bins' are present.",
          "optional": true,
          "type": "number"
        }
      }
    },
    "distribution": {
      "description": "DEPRECATED - Distribution of the target at this node",
      "optional": true,
      "type": "array",
      "items": {
        "type": "array",
        "items": [
          {"type": ["string", "number"]},
          {"type": "number"}
        ]
      }
    },
    "children": {
      "description": "Children of this node",
      "optional": true,
      "type": "array",
      "items": {
        "$ref": "node-schema.json"
      },
      "minItems": 1
    }
  }
}
{
  "description": "Sampling specification. If not given, no sampling is performed.",
  "type": "object",
  "properties": {
    "rate": {
      "description": "The rate: fraction of rows we pick.",
      "type": "number",
      "minimum": 0
    },
    "replace": {
      "description": "Whether we sample with replacement or not.",
      "type": "boolean",
      "optional": true,
      "default": false
    },
    "out_of_bag": {
      "description": "Is the sampling out of bag?",
      "type": "boolean",
      "optional": true,
      "default": false
    },
    "seed": {
      "description": "A string to feed the random number generator used for sampling. The same seed always produces the same sample (if all other parameters stay the same). If not specified, we choose a seed at random.",
      "type": "string",
      "optional": true
    }
  }
}
{
  "description": "A model consisting of a decision tree",
  "type": "object",
  "extends": {
    "$ref": "model-schema.json"
  },
  "properties": {
    "model": {
      "type": "object",
      "properties": {
        "kind": {
          "description": "The kind of tree model.",
          "enum": ["stree"]
        },
        "missing_strategy": {
          "description": "Action to take on missing data.",
          "optional": true,
          "enum": ["last_prediction"]
        },
        "split_criterion": {
          "description": "Method of choosing best attribute and split point for a given node.",
          "optional": true,
          "enum": [
            "information_gain",
            "information_gain_ratio",
            "information_gain_mix",
            "squared_error",
            "squared_error_ratio"
          ]
        },
        "stat_pruning": {
          "description": "Eliminates low confidence leaf nodes from tree using statistical tests.",
          "optional": true,
          "default": true,
          "type": "boolean"
        },
        "support_threshold": {
          "description": "For a split to be valid, each child's support (instances / total instances) must be greater than this threshold.",
          "optional": true,
          "default": 0.001,
          "type": "number",
          "minimum": 0,
          "maximum": 1
        },
        "depth_threshold": {
          "description": "The depth, or generation, limit for a tree.",
          "optional": true,
          "default": 20,
          "type": "integer",
          "minimum": 1
        },
        "prune_holdout": {
          "description": "DEPRECATED - Pruning with a holdout set.",
          "optional": true,
          "type": "number",
          "minimum": 0,
          "maximum": 1
        },
        "freeze_threshold": {
          "description": "PRIVATE - Once a field histogram exceeds this many inserts, its bin locations are 'frozen' into place to improve the performance of future inserts.",
          "optional": true,
          "default": 4096,
          "type": "integer",
          "minimum": 0
        },
        "objective_histogram_size": {
          "description": "PRIVATE - Size of the histograms for capturing the objective field.",
          "optional": true,
          "default": 32,
          "type": "integer"
        },
        "field_histogram_size": {
          "description": "PRIVATE - Size of the histograms for capturing input fields.",
          "optional": true,
          "default": 64,
          "type": "integer"
        },
        "similarity_threshold": {
          "description": "PRIVATE - The threshold for early splitting (lower requires more similarity).",
          "optional": true,
          "default": 0.15,
          "type": "number",
          "minimum": 0,
          "maximum": 1
        },
        "split_score_threshold": {
          "description": "PRIVATE - The minimum score required for a split to be valid.",
          "optional": true,
          "default": 1.0E-12,
          "type": "number",
          "minimum": 0,
          "maximum": 1
        },
        "node_threshold": {
          "description": "PRIVATE - The soft limit for number of nodes in the tree.",
          "optional": true,
          "default": 1024,
          "type": "integer"
        },
        "selective_pruning": {
          "description": "PRIVATE - When true, stat pruning will have less effect for small datasets.",
          "optional": true,
          "type": "boolean"
        },
        "z_statistic": {
          "description": "PRIVATE - Parameter for stat pruning.",
          "optional": true,
          "default": 2,
          "type": "number"
        },
        "split_early": {
          "description": "PRIVATE - Allows early splits.",
          "optional": true,
          "default": true,
          "type": "boolean"
        },
        "importance": {
          "description": "Contains pairs of field ids and importance scores (one for each input field). The higher the score, the more the field helps reduce error on the training set.",
          "optional": true,
          "type": "array",
          "items": {
            "type": "array",
            "items": [
              {"type": "string"},
              {"type": "number"}
            ]
          }
        },
        "root": {
          "description": "The root node of the decision tree.",
          "optional": true,
          "$ref": "node-schema.json"
        }
      }
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment