JSON Schemas

Query

JSON objects expected by the search endpoint.

  1{
  2  "$schema": "http://json-schema.org/draft-07/schema#",
  3  "title": "Query",
  4  "description": "JSON object that specifies queries for searching datasets in Auctus.",
  5  "type": "object",
  6  "properties": {
  7    "keywords": {
  8      "description": "Keywords that match a dataset. The keywords can be matched against the dataset title, dataset description, dataset column names, etc.",
  9      "oneOf":[
 10        {
 11          "type": "string"
 12        },
 13        {
 14          "type": "array",
 15          "items": {
 16            "type": "string"
 17          }
 18        }
 19      ]
 20    },
 21    "source": {
 22      "description": "Source or sources that results should come from.",
 23      "oneOf": [
 24        {
 25          "type": "string"
 26        },
 27        {
 28          "type": "array",
 29          "items": {
 30            "type": "string"
 31          }
 32        }
 33      ]
 34    },
 35    "types": {
 36      "description": "Types of datasets we are searching for.",
 37      "oneOf": [
 38        {
 39          "$ref": "#/definitions/dataset_type"
 40        },
 41        {
 42          "type": "array",
 43          "items": {
 44            "$ref": "#/definitions/dataset_type"
 45          }
 46        }
 47      ]
 48    },
 49    "augmentation_type": {
 50      "type": "string",
 51      "description": "Type of augmentation with the input data, i.e., join or union.",
 52      "enum": [
 53        "join",
 54        "union"
 55      ]
 56    },
 57    "variables": {
 58      "type": "array",
 59      "description": "Describes a set of features (variables) that a matching dataset must have. Datasets that match more of these features will be ranked higher.",
 60      "items": {
 61        "oneOf": [
 62          {
 63            "$ref": "#/definitions/temporal_variable"
 64          },
 65          {
 66            "$ref": "#/definitions/geospatial_variable"
 67          },
 68          {
 69            "$ref": "#/definitions/tabular_variable"
 70          },
 71          {
 72            "$ref": "#/definitions/named_entity_variable"
 73          }
 74        ]
 75      }
 76    }
 77  },
 78  "definitions": {
 79    "dataset_type": {
 80      "type": "string",
 81      "description": "Types of datasets.",
 82      "enum": [
 83        "numerical",
 84        "categorical",
 85        "spatial",
 86        "temporal"
 87      ]
 88    },
 89    "temporal_variable": {
 90      "type": "object",
 91      "description": "Describes columns containing temporal information.",
 92      "properties": {
 93        "type": {
 94          "type": "string",
 95          "enum": [
 96            "temporal_variable"
 97          ]
 98        },
 99        "start": {
100          "type": "string",
101          "description": "Requested dates are more recent than this date."
102        },
103        "end": {
104          "type": "string",
105          "description": "Requested dates are older than this date."
106        },
107        "granularity": {
108          "type": "string",
109          "description": "Requested dates should match the requested granularity. For example, if 'day' is requested, the best match is a dataset with dates; however a dataset with hours is relevant too as hourly data can be aggregated into days.",
110          "enum": [
111            "year",
112            "quarter",
113            "month",
114            "week",
115            "day",
116            "hour",
117            "minute",
118            "second"
119          ]
120        }
121      },
122      "required": [
123        "type"
124      ]
125    },
126    "geospatial_variable": {
127      "type": "object",
128      "description": "Describes columns containing geospatial entities.",
129      "properties": {
130        "type": {
131          "type": "string",
132          "enum": [
133            "geospatial_variable"
134          ]
135        },
136        "area_name":{
137          "type": "string",
138          "description": "A named administrative area."
139        },
140        "latitude1":{
141          "type": "number",
142          "description": "The latitude of the top left point."
143        },
144        "longitude1":{
145          "type": "number",
146          "description": "The longitude of the top left point."
147        },
148        "latitude2":{
149          "type": "number",
150          "description": "The latitude of the bottom right point."
151        },
152        "longitude2":{
153          "type": "number",
154          "description": "The longitude of the bottom right point."
155        },
156        "granularity": {
157          "type": "string",
158          "description": "The granularity of the entities contained in a bounding box.",
159          "enum": [
160            "country",
161            "state",
162            "city",
163            "county",
164            "postal_code"
165          ]
166        }
167      },
168      "required": [
169        "type"
170      ]
171    },
172    "tabular_variable": {
173      "type": "object",
174      "description": "Describes columns that a matching dataset should have, expressed in terms of columns of the supplied dataset.",
175      "properties": {
176        "type": {
177          "type": "string",
178          "enum": [
179            "dataframe_variable"
180          ]
181        },
182        "columns": {
183          "type": "array",
184          "description": "A set of indices that identifies a set of columns in the supplied dataset. When multiple indices are provided, the matching dataset should contain columns corresponding to each of the given columns.",
185          "items": {
186            "type": "integer"
187          }
188        },
189        "relationship": {
190          "type": "string",
191          "description": "The relationship between a column in the supplied dataset and a column in a matching dataset. The default is 'contains'.",
192          "enum": [
193            "contains",
194            "similar",
195            "correlated",
196            "anti-correlated",
197            "mutually-informative",
198            "mutually-uninformative"
199          ]
200        }
201      },
202      "required": [
203        "type"
204      ]
205    },
206    "named_entity_variable": {
207      "type": "object",
208      "description": "Describes a set of named entities that a matching dataset must contain.",
209      "properties": {
210        "type": {
211          "type": "string",
212          "enum": [
213            "named_entity_variable"
214          ]
215        },
216        "entities": {
217          "type": "array",
218          "description": "A set of entity names. A matching dataset should contain a column with the requested names.",
219          "items": {
220              "type": "string"
221          }
222        }
223      },
224      "required": [
225        "type"
226      ]
227    }
228  }
229}

Result schema

Description of a dataset, such as a search result. The search endpoint returns an array of these objects. They are also what you pass to datamart_materialize.download().

  1{
  2  "$schema": "http://json-schema.org/draft-07/schema#",
  3  "title": "Result",
  4  "type": "object",
  5  "properties": {
  6    "id": {
  7      "type": "string",
  8      "description": "The dataset identifier provided by Auctus."
  9    },
 10    "score": {
 11      "type": "number",
 12      "description": "A non-negative number that represents the relevance of this dataset to the query. Larger scores indicate better matches."
 13    },
 14    "metadata": {
 15      "$ref": "#/definitions/metadata"
 16    },
 17    "augmentation": {
 18      "type": "object",
 19      "description": "The augmentation suggested by Auctus.",
 20      "properties": {
 21        "type": {
 22          "type": "string",
 23          "enum": [
 24            "join",
 25            "union",
 26            "none"
 27          ]
 28        },
 29        "left_columns": {
 30          "type": "array",
 31          "description": "The left-side columns for the augmentation, which correspond to the supplied dataset.",
 32          "items": {
 33            "$ref": "#/definitions/augmentation_unit"
 34          }
 35        },
 36        "right_columns": {
 37          "type": "array",
 38          "description": "The right-side columns for the augmentation, which correspond to the Auctus dataset.",
 39          "items": {
 40            "$ref": "#/definitions/augmentation_unit"
 41          }
 42        },
 43        "left_columns_names": {
 44          "type": "array",
 45          "description": "The names of the left-side columns, for information.",
 46          "items": {
 47            "type": "array",
 48            "items": {
 49              "type": "string"
 50            }
 51          }
 52        },
 53        "right_columns_names": {
 54          "type": "array",
 55          "description": "The names of the right-side columns, for information.",
 56          "items": {
 57            "type": "array",
 58            "items": {
 59              "type": "string"
 60            }
 61          }
 62        },
 63        "temporal_resolution": {
 64          "$ref": "#/definitions/temporal_resolution"
 65        },
 66        "agg_functions": {
 67          "type": "object",
 68          "description": "Maps column names to aggregation function(s) names.",
 69          "additionalProperties": {
 70            "oneOf": [
 71              {
 72                "type": "string"
 73              },
 74              {
 75                "type": "array",
 76                "items": {
 77                  "type": "string"
 78                }
 79              }
 80            ]
 81          }
 82        }
 83      },
 84      "required": [
 85        "type",
 86        "left_columns",
 87        "right_columns"
 88      ],
 89      "additionalProperties": false
 90    }
 91  },
 92  "required": ["id", "score", "metadata"],
 93  "additionalProperties": true,
 94  "definitions": {
 95    "metadata": {
 96      "type": "object",
 97      "description": "The metadata associated with the dataset.",
 98      "properties": {
 99        "name": {"type": "string"},
100        "description": {"type": "string"},
101        "source": {"type": "string"},
102        "date": {"type": "string"},
103        "license": {"type": "string"},
104        "sample": {
105          "type": "string",
106          "description": "A random sample in CSV format"
107        },
108        "types": {
109          "type": "array",
110          "items": {
111            "type": "string",
112            "enum": [
113              "spatial",
114              "temporal",
115              "numerical",
116              "categorical"
117            ]
118          }
119        },
120        "size": {
121          "type": "integer",
122          "description": "Size of the file in bytes"
123        },
124        "nb_rows": {
125          "type": "integer",
126          "description": "Total number of rows in the file"
127        },
128        "nb_profiled_rows": {
129          "type": "integer",
130          "description": "Number of rows in the sample that was profiled. If the dataset is too big, this will be less than 'nb_rows'"
131        },
132        "version": {
133          "type": "string",
134          "description": "Version number of the profiler which generated this record"
135        },
136        "columns": {
137          "type": "array",
138          "items": {
139            "type": "object",
140            "properties": {
141              "name": {"type": "string"},
142              "structural_type": {"type": "string"},
143              "semantic_types": {
144                "type": "array",
145                "items": {"type": "string"}
146              },
147              "missing_values_ratio": {"type": "number"},
148              "unclean_values_ratio": {"type": "number"},
149              "num_distinct_values": {"type": "integer"},
150              "temporal_resolution": {
151                "$ref": "#/definitions/temporal_resolution"
152              },
153              "admin_area_level": {
154                "type": "integer"
155              },
156              "point_format": {
157                "type": "string",
158                "enum": [
159                  "lat,long",
160                  "long,lat"
161                ]
162              },
163              "mean": {"type": "number"},
164              "stddev": {"type": "number"},
165              "coverage": {
166                "type": "array",
167                "items": {
168                  "type": "object",
169                  "properties": {
170                    "range": {
171                      "type": "object",
172                      "properties": {
173                        "gte": {"type": "number"},
174                        "lte": {"type": "number"}
175                      },
176                      "required": ["gte", "lte"],
177                      "additionalProperties": false
178                    }
179                  },
180                  "required": ["range"],
181                  "additionalProperties": false
182                }
183              },
184              "plot": {
185                "$ref": "#/definitions/plot"
186              }
187            },
188            "additionalProperties": false
189          }
190        },
191        "spatial_coverage": {
192          "type": "array",
193          "items": {
194            "type": "object",
195            "description": "Keep in sync, search code for 279a32",
196            "properties": {
197              "type": {"type": "string"},
198              "column_names": {
199                "type": "array",
200                "items": {"type": "string"}
201              },
202              "column_indexes": {
203                "type": "array",
204                "items": {"type": "number"}
205              },
206              "ranges": {
207                "$ref": "#/definitions/spatial_ranges"
208              },
209              "geohashes4": {
210                "type": "array",
211                "items": {
212                  "type": "object",
213                  "properties": {
214                    "hash": {"type": "string"},
215                    "number": {"type": "integer"}
216                  }
217                }
218              },
219              "number": {
220                "type": "integer",
221                "description": "Number of known locations from which the coverage was generated"
222              }
223            },
224            "required": ["type", "column_names", "column_indexes", "ranges"],
225            "additionalProperties": false
226          }
227        },
228        "temporal_coverage": {
229          "type": "array",
230          "items": {
231            "type": "object",
232            "properties": {
233              "type": {"type": "string"},
234              "column_names": {
235                "type": "array",
236                "items": {"type": "string"}
237              },
238              "column_indexes": {
239                "type": "array",
240                "items": {"type": "number"}
241              },
242              "column_types": {
243                "type": "array",
244                "items": {"type": "string"}
245              },
246              "temporal_resolution": {"type": "string"},
247              "ranges": {
248                "type": "array",
249                "items": {
250                  "type": "object",
251                  "properties": {
252                    "lte": {"type": "number"},
253                    "gte": {"type": "number"}
254                  }
255                }
256              }
257            },
258            "required": ["type", "column_names", "column_indexes", "column_types", "ranges"],
259            "additionalProperties": false
260          }
261        }
262      },
263      "required": ["name"]
264    },
265    "augmentation_unit": {
266      "type": "array",
267      "description": "An array of column identifiers that represents a unit for the augmentation. In the simplest case, we have a single identifier, and in more complex cases, we have multiple identifiers (e.g., columns 'latitude' and 'longitude' could be combined to join and/or concatenate with column 'country').",
268      "items": {
269        "type": "integer"
270      }
271    },
272    "temporal_resolution": {
273      "type": "string",
274      "enum": [
275        "year",
276        "month",
277        "week",
278        "day",
279        "hour",
280        "minute",
281        "second"
282      ]
283    },
284    "spatial_ranges": {
285      "type": "array",
286      "items": {
287        "type": "object",
288        "properties": {
289          "range": {
290            "type": "object",
291            "properties": {
292              "type": {
293                "type": "string",
294                "enum": ["envelope"]
295              },
296              "coordinates": {
297                "type": "array",
298                "items": {
299                  "type": "array",
300                  "items": {"type": "number"}
301                }
302              }
303            },
304            "required": ["type", "coordinates"],
305            "additionalProperties": false
306          }
307        },
308        "required": ["range"],
309        "additionalProperties": false
310      }
311    },
312    "plot": {
313      "description": "Plot data meant for display",
314      "oneOf": [
315        {
316          "type": "object",
317          "properties": {
318            "type": {
319              "type": "string",
320              "enum": ["histogram_numerical"]
321            },
322            "data": {
323              "type": "array",
324              "items": {
325                "type": "object",
326                "properties": {
327                  "count": {"type": "integer"},
328                  "bin_start": {"type": "number"},
329                  "bin_end": {"type": "number"}
330                },
331                "required": ["count", "bin_start", "bin_end"]
332              }
333            }
334          },
335          "required": ["type", "data"],
336          "additionalProperties": false
337        },
338        {
339          "type": "object",
340          "properties": {
341            "type": {
342              "type": "string",
343              "enum": ["histogram_temporal"]
344            },
345            "data": {
346              "type": "array",
347              "items": {
348                "type": "object",
349                "properties": {
350                  "count": {"type": "integer"},
351                  "date_start": {"type": "string"},
352                  "date_end": {"type": "string"}
353                },
354                "required": ["count", "date_start", "date_end"]
355              }
356            }
357          },
358          "required": ["type", "data"],
359          "additionalProperties": false
360        },
361        {
362          "type": "object",
363          "properties": {
364            "type": {
365              "type": "string",
366              "enum": [
367                "histogram_categorical",
368                "histogram_text"
369              ]
370            },
371            "data": {
372              "type": "array",
373              "items": {
374                "type": "object",
375                "properties": {
376                  "count": {"type": "integer"},
377                  "bin": {"type": "string"}
378                },
379                "required": ["count", "bin"]
380              }
381            }
382          },
383          "required": ["type", "data"],
384          "additionalProperties": false
385        }
386      ]
387    }
388  }
389}