JSON Schemas
Query
JSON objects expected by the search endpoint.
1{
2 "$schema": "http://json-schema.org/draft-07/schema#",
3 "title": "Query",
4 "description": "JSON object that specifies queries for searching datasets in Auctus.",
5 "type": "object",
6 "properties": {
7 "keywords": {
8 "description": "Keywords that match a dataset. The keywords can be matched against the dataset title, dataset description, dataset column names, etc.",
9 "oneOf":[
10 {
11 "type": "string"
12 },
13 {
14 "type": "array",
15 "items": {
16 "type": "string"
17 }
18 }
19 ]
20 },
21 "source": {
22 "description": "Source or sources that results should come from.",
23 "oneOf": [
24 {
25 "type": "string"
26 },
27 {
28 "type": "array",
29 "items": {
30 "type": "string"
31 }
32 }
33 ]
34 },
35 "types": {
36 "description": "Types of datasets we are searching for.",
37 "oneOf": [
38 {
39 "$ref": "#/definitions/dataset_type"
40 },
41 {
42 "type": "array",
43 "items": {
44 "$ref": "#/definitions/dataset_type"
45 }
46 }
47 ]
48 },
49 "augmentation_type": {
50 "type": "string",
51 "description": "Type of augmentation with the input data, i.e., join or union.",
52 "enum": [
53 "join",
54 "union"
55 ]
56 },
57 "variables": {
58 "type": "array",
59 "description": "Describes a set of features (variables) that a matching dataset must have. Datasets with more features will be ranked higher.",
60 "items": {
61 "oneOf": [
62 {
63 "$ref": "#/definitions/temporal_variable"
64 },
65 {
66 "$ref": "#/definitions/geospatial_variable"
67 },
68 {
69 "$ref": "#/definitions/tabular_variable"
70 },
71 {
72 "$ref": "#/definitions/named_entity_variable"
73 }
74 ]
75 }
76 }
77 },
78 "definitions": {
79 "dataset_type": {
80 "type": "string",
81 "description": "Types of datasets.",
82 "enum": [
83 "numerical",
84 "categorical",
85 "spatial",
86 "temporal"
87 ]
88 },
89 "temporal_variable": {
90 "type": "object",
91 "description": "Describes columns containing temporal information.",
92 "properties": {
93 "type": {
94 "type": "string",
95 "enum": [
96 "temporal_variable"
97 ]
98 },
99 "start": {
100 "type": "string",
101 "description": "Requested dates are more recent than this date."
102 },
103 "end": {
104 "type": "string",
105 "description": "Requested dates are older than this date."
106 },
107 "granularity": {
108 "type": "string",
109 "description": "Requested dates should match the requested granularity. For example, if 'day' is requested, the best match is a dataset with dates; however a dataset with hours is relevant too as hourly data can be aggregated into days.",
110 "enum": [
111 "year",
112 "quarter",
113 "month",
114 "week",
115 "day",
116 "hour",
117 "minute",
118 "second"
119 ]
120 }
121 },
122 "required": [
123 "type"
124 ]
125 },
126 "geospatial_variable": {
127 "type": "object",
128 "description": "Describes columns containing geospatial entities.",
129 "properties": {
130 "type": {
131 "type": "string",
132 "enum": [
133 "geospatial_variable"
134 ]
135 },
136 "area_name":{
137 "type": "string",
138 "description": "A named administrative area."
139 },
140 "latitude1":{
141 "type": "number",
142 "description": "The latitude of the top left point."
143 },
144 "longitude1":{
145 "type": "number",
146 "description": "The longitude of the top left point."
147 },
148 "latitude2":{
149 "type": "number",
150 "description": "The latitude of the bottom right point."
151 },
152 "longitude2":{
153 "type": "number",
154 "description": "The longitude of the bottom right point."
155 },
156 "granularity": {
157 "type": "string",
158 "description": "The granularity of the entities contained in a bounding box.",
159 "enum": [
160 "country",
161 "state",
162 "city",
163 "county",
164 "postal_code"
165 ]
166 }
167 },
168 "required": [
169 "type"
170 ]
171 },
172 "tabular_variable": {
173 "type": "object",
174 "description": "Describes columns that a matching dataset should have in terms of columns of the supplied dataset.",
175 "properties": {
176 "type": {
177 "type": "string",
178 "enum": [
179 "dataframe_variable"
180 ]
181 },
182 "columns": {
183 "type": "array",
184 "description": "A set of indices that identifies a set of columns in the supplied dataset. When multiple indices are provided, the matching dataset should contain columns corresponding to each of the given columns.",
185 "items": {
186 "type": "integer"
187 }
188 },
189 "relationship": {
190 "type": "string",
191 "description": "The relationship between a column in the supplied dataset and a column in a matching dataset. The default is 'contains'.",
192 "enum": [
193 "contains",
194 "similar",
195 "correlated",
196 "anti-correlated",
197 "mutually-informative",
198 "mutually-uninformative"
199 ]
200 }
201 },
202 "required": [
203 "type"
204 ]
205 },
206 "named_entity_variable": {
207 "type": "object",
208 "description": "Describes a set of named entities that a matching dataset must contain.",
209 "properties": {
210 "type": {
211 "type": "string",
212 "enum": [
213 "named_entity_variable"
214 ]
215 },
216 "entities": {
217 "type": "array",
218 "description": "A set of entity names. A matching dataset should contain a column with the requested names.",
219 "items": {
220 "type": "string"
221 }
222 }
223 },
224 "required": [
225 "type"
226 ]
227 }
228 }
229}
Result schema
Description of a dataset, such as a search result. The search endpoint returns an array of these objects. They are also what you pass to datamart_materialize.download().
1{
2 "$schema": "http://json-schema.org/draft-07/schema#",
3 "title": "Result",
4 "type": "object",
5 "properties": {
6 "id": {
7 "type": "string",
8 "description": "The dataset identifier provided by Auctus."
9 },
10 "score": {
11 "type": "number",
12 "description": "A non-negative number that represents the relevance of this dataset to the query. Larger scores indicate better matches."
13 },
14 "metadata": {
15 "$ref": "#/definitions/metadata"
16 },
17 "augmentation": {
18 "type": "object",
19 "description": "The augmentation suggested by Auctus.",
20 "properties": {
21 "type": {
22 "type": "string",
23 "enum": [
24 "join",
25 "union",
26 "none"
27 ]
28 },
29 "left_columns": {
30 "type": "array",
31 "description": "The left-side columns for the augmentation, which correspond to the supplied dataset.",
32 "items": {
33 "$ref": "#/definitions/augmentation_unit"
34 }
35 },
36 "right_columns": {
37 "type": "array",
38 "description": "The right-side columns for the augmentation, which correspond to the Auctus dataset.",
39 "items": {
40 "$ref": "#/definitions/augmentation_unit"
41 }
42 },
43 "left_columns_names": {
44 "type": "array",
45 "description": "The names of the left-side columns, for information.",
46 "items": {
47 "type": "array",
48 "items": {
49 "type": "string"
50 }
51 }
52 },
53 "right_columns_names": {
54 "type": "array",
55 "description": "The names of the right-side columns, for information.",
56 "items": {
57 "type": "array",
58 "items": {
59 "type": "string"
60 }
61 }
62 },
63 "temporal_resolution": {
64 "$ref": "#/definitions/temporal_resolution"
65 },
66 "agg_functions": {
67 "type": "object",
68 "description": "Maps each column name to the name(s) of its aggregation function(s).",
69 "additionalProperties": {
70 "oneOf": [
71 {
72 "type": "string"
73 },
74 {
75 "type": "array",
76 "items": {
77 "type": "string"
78 }
79 }
80 ]
81 }
82 }
83 },
84 "required": [
85 "type",
86 "left_columns",
87 "right_columns"
88 ],
89 "additionalProperties": false
90 }
91 },
92 "required": ["id", "score", "metadata"],
93 "additionalProperties": true,
94 "definitions": {
95 "metadata": {
96 "type": "object",
97 "description": "The metadata associated with the dataset.",
98 "properties": {
99 "name": {"type": "string"},
100 "description": {"type": "string"},
101 "source": {"type": "string"},
102 "date": {"type": "string"},
103 "license": {"type": "string"},
104 "sample": {
105 "type": "string",
106 "description": "A random sample in CSV format"
107 },
108 "types": {
109 "type": "array",
110 "items": {
111 "type": "string",
112 "enum": [
113 "spatial",
114 "temporal",
115 "numerical",
116 "categorical"
117 ]
118 }
119 },
120 "size": {
121 "type": "integer",
122 "description": "Size of the file in bytes"
123 },
124 "nb_rows": {
125 "type": "integer",
126 "description": "Total number of rows in the file"
127 },
128 "nb_profiled_rows": {
129 "type": "integer",
130 "description": "Number of rows in the sample that was profiled. If the dataset is too big, this will be less than 'nb_rows'"
131 },
132 "version": {
133 "type": "string",
134 "description": "Version number of the profiler which generated this record"
135 },
136 "columns": {
137 "type": "array",
138 "items": {
139 "type": "object",
140 "properties": {
141 "name": {"type": "string"},
142 "structural_type": {"type": "string"},
143 "semantic_types": {
144 "type": "array",
145 "items": {"type": "string"}
146 },
147 "missing_values_ratio": {"type": "number"},
148 "unclean_values_ratio": {"type": "number"},
149 "num_distinct_values": {"type": "integer"},
150 "temporal_resolution": {
151 "$ref": "#/definitions/temporal_resolution"
152 },
153 "admin_area_level": {
154 "type": "integer"
155 },
156 "point_format": {
157 "type": "string",
158 "enum": [
159 "lat,long",
160 "long,lat"
161 ]
162 },
163 "mean": {"type": "number"},
164 "stddev": {"type": "number"},
165 "coverage": {
166 "type": "array",
167 "items": {
168 "type": "object",
169 "properties": {
170 "range": {
171 "type": "object",
172 "properties": {
173 "gte": {"type": "number"},
174 "lte": {"type": "number"}
175 },
176 "required": ["gte", "lte"],
177 "additionalProperties": false
178 }
179 },
180 "required": ["range"],
181 "additionalProperties": false
182 }
183 },
184 "plot": {
185 "$ref": "#/definitions/plot"
186 }
187 },
188 "additionalProperties": false
189 }
190 },
191 "spatial_coverage": {
192 "type": "array",
193 "items": {
194 "type": "object",
195 "description": "Keep in sync, search code for 279a32",
196 "properties": {
197 "type": {"type": "string"},
198 "column_names": {
199 "type": "array",
200 "items": {"type": "string"}
201 },
202 "column_indexes": {
203 "type": "array",
204 "items": {"type": "number"}
205 },
206 "ranges": {
207 "$ref": "#/definitions/spatial_ranges"
208 },
209 "geohashes4": {
210 "type": "array",
211 "items": {
212 "type": "object",
213 "properties": {
214 "hash": {"type": "string"},
215 "number": {"type": "integer"}
216 }
217 }
218 },
219 "number": {
220 "type": "integer",
221 "description": "Number of known locations from which the coverage was generated"
222 }
223 },
224 "required": ["type", "column_names", "column_indexes", "ranges"],
225 "additionalProperties": false
226 }
227 },
228 "temporal_coverage": {
229 "type": "array",
230 "items": {
231 "type": "object",
232 "properties": {
233 "type": {"type": "string"},
234 "column_names": {
235 "type": "array",
236 "items": {"type": "string"}
237 },
238 "column_indexes": {
239 "type": "array",
240 "items": {"type": "number"}
241 },
242 "column_types": {
243 "type": "array",
244 "items": {"type": "string"}
245 },
246 "temporal_resolution": {"type": "string"},
247 "ranges": {
248 "type": "array",
249 "items": {
250 "type": "object",
251 "properties": {
252 "lte": {"type": "number"},
253 "gte": {"type": "number"}
254 }
255 }
256 }
257 },
258 "required": ["type", "column_names", "column_indexes", "column_types", "ranges"],
259 "additionalProperties": false
260 }
261 }
262 },
263 "required": ["name"]
264 },
265 "augmentation_unit": {
266 "type": "array",
267 "description": "An array of column identifiers that represents a unit for the augmentation. In the simplest case, we have a single identifier, and in more complex cases, we have multiple identifiers (e.g., columns 'latitude' and 'longitude' could be combined to join and/or concatenate with column 'country').",
268 "items": {
269 "type": "integer"
270 }
271 },
272 "temporal_resolution": {
273 "type": "string",
274 "enum": [
275 "year",
276 "month",
277 "week",
278 "day",
279 "hour",
280 "minute",
281 "second"
282 ]
283 },
284 "spatial_ranges": {
285 "type": "array",
286 "items": {
287 "type": "object",
288 "properties": {
289 "range": {
290 "type": "object",
291 "properties": {
292 "type": {
293 "type": "string",
294 "enum": ["envelope"]
295 },
296 "coordinates": {
297 "type": "array",
298 "items": {
299 "type": "array",
300 "items": {"type": "number"}
301 }
302 }
303 },
304 "required": ["type", "coordinates"],
305 "additionalProperties": false
306 }
307 },
308 "required": ["range"],
309 "additionalProperties": false
310 }
311 },
312 "plot": {
313 "description": "Plot data meant for display",
314 "oneOf": [
315 {
316 "type": "object",
317 "properties": {
318 "type": {
319 "type": "string",
320 "enum": ["histogram_numerical"]
321 },
322 "data": {
323 "type": "array",
324 "items": {
325 "type": "object",
326 "properties": {
327 "count": {"type": "integer"},
328 "bin_start": {"type": "number"},
329 "bin_end": {"type": "number"}
330 },
331 "required": ["count", "bin_start", "bin_end"]
332 }
333 }
334 },
335 "required": ["type", "data"],
336 "additionalProperties": false
337 },
338 {
339 "type": "object",
340 "properties": {
341 "type": {
342 "type": "string",
343 "enum": ["histogram_temporal"]
344 },
345 "data": {
346 "type": "array",
347 "items": {
348 "type": "object",
349 "properties": {
350 "count": {"type": "integer"},
351 "date_start": {"type": "string"},
352 "date_end": {"type": "string"}
353 },
354 "required": ["count", "date_start", "date_end"]
355 }
356 }
357 },
358 "required": ["type", "data"],
359 "additionalProperties": false
360 },
361 {
362 "type": "object",
363 "properties": {
364 "type": {
365 "type": "string",
366 "enum": [
367 "histogram_categorical",
368 "histogram_text"
369 ]
370 },
371 "data": {
372 "type": "array",
373 "items": {
374 "type": "object",
375 "properties": {
376 "count": {"type": "integer"},
377 "bin": {"type": "string"}
378 },
379 "required": ["count", "bin"]
380 }
381 }
382 },
383 "required": ["type", "data"],
384 "additionalProperties": false
385 }
386 ]
387 }
388 }
389}