Coverage for datasette/facets.py : 89%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import json
2import urllib
3import re
4from datasette import hookimpl
5from datasette.database import QueryInterrupted
6from datasette.utils import (
7 escape_sqlite,
8 path_with_added_args,
9 path_with_removed_args,
10 detect_json1,
11 InvalidSql,
12 sqlite3,
13)
16def load_facet_configs(request, table_metadata):
17 # Given a request and the metadata configuration for a table, return
18 # a dictionary of selected facets, their lists of configs and for each
19 # config whether it came from the request or the metadata.
20 #
21 # return {type: [
22 # {"source": "metadata", "config": config1},
23 # {"source": "request", "config": config2}]}
24 facet_configs = {}
25 table_metadata = table_metadata or {}
26 metadata_facets = table_metadata.get("facets", [])
27 for metadata_config in metadata_facets:
28 if isinstance(metadata_config, str):
29 type = "column"
30 metadata_config = {"simple": metadata_config}
31 else:
32 assert (
33 len(metadata_config.values()) == 1
34 ), "Metadata config dicts should be {type: config}"
35 type, metadata_config = metadata_config.items()[0]
36 if isinstance(metadata_config, str):
37 metadata_config = {"simple": metadata_config}
38 facet_configs.setdefault(type, []).append(
39 {"source": "metadata", "config": metadata_config}
40 )
41 qs_pairs = urllib.parse.parse_qs(request.query_string, keep_blank_values=True)
42 for key, values in qs_pairs.items():
43 if key.startswith("_facet"):
44 # Figure out the facet type
45 if key == "_facet":
46 type = "column"
47 elif key.startswith("_facet_"):
48 type = key[len("_facet_") :]
49 for value in values:
50 # The value is the config - either JSON or not
51 if value.startswith("{"):
52 config = json.loads(value)
53 else:
54 config = {"simple": value}
55 facet_configs.setdefault(type, []).append(
56 {"source": "request", "config": config}
57 )
58 return facet_configs
61@hookimpl
62def register_facet_classes():
63 classes = [ColumnFacet, DateFacet]
64 if detect_json1():
65 classes.append(ArrayFacet)
66 return classes
69class Facet:
70 type = None
72 def __init__(
73 self,
74 ds,
75 request,
76 database,
77 sql=None,
78 table=None,
79 params=None,
80 metadata=None,
81 row_count=None,
82 ):
83 assert table or sql, "Must provide either table= or sql="
84 self.ds = ds
85 self.request = request
86 self.database = database
87 # For foreign key expansion. Can be None for e.g. canned SQL queries:
88 self.table = table
89 self.sql = sql or "select * from [{}]".format(table)
90 self.params = params or []
91 self.metadata = metadata
92 # row_count can be None, in which case we calculate it ourselves:
93 self.row_count = row_count
95 def get_configs(self):
96 configs = load_facet_configs(self.request, self.metadata)
97 return configs.get(self.type) or []
99 def get_querystring_pairs(self):
100 # ?_foo=bar&_foo=2&empty= becomes:
101 # [('_foo', 'bar'), ('_foo', '2'), ('empty', '')]
102 return urllib.parse.parse_qsl(self.request.query_string, keep_blank_values=True)
104 async def suggest(self):
105 return []
107 async def facet_results(self):
108 # returns ([results], [timed_out])
109 # TODO: Include "hideable" with each one somehow, which indicates if it was
110 # defined in metadata (in which case you cannot turn it off)
111 raise NotImplementedError
113 async def get_columns(self, sql, params=None):
114 # Detect column names using the "limit 0" trick
115 return (
116 await self.ds.execute(
117 self.database, "select * from ({}) limit 0".format(sql), params or []
118 )
119 ).columns
121 async def get_row_count(self):
122 if self.row_count is None:
123 self.row_count = (
124 await self.ds.execute(
125 self.database,
126 "select count(*) from ({})".format(self.sql),
127 self.params,
128 )
129 ).rows[0][0]
130 return self.row_count
133class ColumnFacet(Facet):
134 type = "column"
136 async def suggest(self):
137 row_count = await self.get_row_count()
138 columns = await self.get_columns(self.sql, self.params)
139 facet_size = self.ds.config("default_facet_size")
140 suggested_facets = []
141 already_enabled = [c["config"]["simple"] for c in self.get_configs()]
142 for column in columns:
143 if column in already_enabled:
144 continue
145 suggested_facet_sql = """
146 select {column}, count(*) as n from (
147 {sql}
148 ) where {column} is not null
149 group by {column}
150 limit {limit}
151 """.format(
152 column=escape_sqlite(column), sql=self.sql, limit=facet_size + 1
153 )
154 distinct_values = None
155 try:
156 distinct_values = await self.ds.execute(
157 self.database,
158 suggested_facet_sql,
159 self.params,
160 truncate=False,
161 custom_time_limit=self.ds.config("facet_suggest_time_limit_ms"),
162 )
163 num_distinct_values = len(distinct_values)
164 if (
165 num_distinct_values
166 and num_distinct_values > 1
167 and num_distinct_values <= facet_size
168 and num_distinct_values < row_count
169 # And at least one has n > 1
170 and any(r["n"] > 1 for r in distinct_values)
171 ):
172 suggested_facets.append(
173 {
174 "name": column,
175 "toggle_url": self.ds.absolute_url(
176 self.request,
177 path_with_added_args(self.request, {"_facet": column}),
178 ),
179 }
180 )
181 except QueryInterrupted:
182 continue
183 return suggested_facets
185 async def facet_results(self):
186 facet_results = {}
187 facets_timed_out = []
189 qs_pairs = self.get_querystring_pairs()
191 facet_size = self.ds.config("default_facet_size")
192 for source_and_config in self.get_configs():
193 config = source_and_config["config"]
194 source = source_and_config["source"]
195 column = config.get("column") or config["simple"]
196 facet_sql = """
197 select {col} as value, count(*) as count from (
198 {sql}
199 )
200 where {col} is not null
201 group by {col} order by count desc, value limit {limit}
202 """.format(
203 col=escape_sqlite(column), sql=self.sql, limit=facet_size + 1
204 )
205 try:
206 facet_rows_results = await self.ds.execute(
207 self.database,
208 facet_sql,
209 self.params,
210 truncate=False,
211 custom_time_limit=self.ds.config("facet_time_limit_ms"),
212 )
213 facet_results_values = []
214 facet_results[column] = {
215 "name": column,
216 "type": self.type,
217 "hideable": source != "metadata",
218 "toggle_url": path_with_removed_args(
219 self.request, {"_facet": column}
220 ),
221 "results": facet_results_values,
222 "truncated": len(facet_rows_results) > facet_size,
223 }
224 facet_rows = facet_rows_results.rows[:facet_size]
225 if self.table:
226 # Attempt to expand foreign keys into labels
227 values = [row["value"] for row in facet_rows]
228 expanded = await self.ds.expand_foreign_keys(
229 self.database, self.table, column, values
230 )
231 else:
232 expanded = {}
233 for row in facet_rows:
234 selected = (column, str(row["value"])) in qs_pairs
235 if selected:
236 toggle_path = path_with_removed_args(
237 self.request, {column: str(row["value"])}
238 )
239 else:
240 toggle_path = path_with_added_args(
241 self.request, {column: row["value"]}
242 )
243 facet_results_values.append(
244 {
245 "value": row["value"],
246 "label": expanded.get((column, row["value"]), row["value"]),
247 "count": row["count"],
248 "toggle_url": self.ds.absolute_url(
249 self.request, toggle_path
250 ),
251 "selected": selected,
252 }
253 )
254 except QueryInterrupted:
255 facets_timed_out.append(column)
257 return facet_results, facets_timed_out
260class ArrayFacet(Facet):
261 type = "array"
263 def _is_json_array_of_strings(self, json_string):
264 try:
265 array = json.loads(json_string)
266 except ValueError:
267 return False
268 for item in array:
269 if not isinstance(item, str):
270 return False
271 return True
273 async def suggest(self):
274 columns = await self.get_columns(self.sql, self.params)
275 suggested_facets = []
276 already_enabled = [c["config"]["simple"] for c in self.get_configs()]
277 for column in columns:
278 if column in already_enabled:
279 continue
280 # Is every value in this column either null or a JSON array?
281 suggested_facet_sql = """
282 select distinct json_type({column})
283 from ({sql})
284 """.format(
285 column=escape_sqlite(column), sql=self.sql
286 )
287 try:
288 results = await self.ds.execute(
289 self.database,
290 suggested_facet_sql,
291 self.params,
292 truncate=False,
293 custom_time_limit=self.ds.config("facet_suggest_time_limit_ms"),
294 log_sql_errors=False,
295 )
296 types = tuple(r[0] for r in results.rows)
297 if types in (("array",), ("array", None)):
298 # Now sanity check that first 100 arrays contain only strings
299 first_100 = [
300 v[0]
301 for v in await self.ds.execute(
302 self.database,
303 "select {column} from ({sql}) where {column} is not null and json_array_length({column}) > 0 limit 100".format(
304 column=escape_sqlite(column), sql=self.sql
305 ),
306 self.params,
307 truncate=False,
308 custom_time_limit=self.ds.config(
309 "facet_suggest_time_limit_ms"
310 ),
311 log_sql_errors=False,
312 )
313 ]
314 if first_100 and all(
315 self._is_json_array_of_strings(r) for r in first_100
316 ):
317 suggested_facets.append(
318 {
319 "name": column,
320 "type": "array",
321 "toggle_url": self.ds.absolute_url(
322 self.request,
323 path_with_added_args(
324 self.request, {"_facet_array": column}
325 ),
326 ),
327 }
328 )
329 except (QueryInterrupted, sqlite3.OperationalError):
330 continue
331 return suggested_facets
333 async def facet_results(self):
334 # self.configs should be a plain list of columns
335 facet_results = {}
336 facets_timed_out = []
338 facet_size = self.ds.config("default_facet_size")
339 for source_and_config in self.get_configs():
340 config = source_and_config["config"]
341 source = source_and_config["source"]
342 column = config.get("column") or config["simple"]
343 facet_sql = """
344 select j.value as value, count(*) as count from (
345 {sql}
346 ) join json_each({col}) j
347 group by j.value order by count desc, value limit {limit}
348 """.format(
349 col=escape_sqlite(column), sql=self.sql, limit=facet_size + 1
350 )
351 try:
352 facet_rows_results = await self.ds.execute(
353 self.database,
354 facet_sql,
355 self.params,
356 truncate=False,
357 custom_time_limit=self.ds.config("facet_time_limit_ms"),
358 )
359 facet_results_values = []
360 facet_results[column] = {
361 "name": column,
362 "type": self.type,
363 "results": facet_results_values,
364 "hideable": source != "metadata",
365 "toggle_url": path_with_removed_args(
366 self.request, {"_facet_array": column}
367 ),
368 "truncated": len(facet_rows_results) > facet_size,
369 }
370 facet_rows = facet_rows_results.rows[:facet_size]
371 pairs = self.get_querystring_pairs()
372 for row in facet_rows:
373 value = str(row["value"])
374 selected = ("{}__arraycontains".format(column), value) in pairs
375 if selected:
376 toggle_path = path_with_removed_args(
377 self.request, {"{}__arraycontains".format(column): value}
378 )
379 else:
380 toggle_path = path_with_added_args(
381 self.request, {"{}__arraycontains".format(column): value}
382 )
383 facet_results_values.append(
384 {
385 "value": value,
386 "label": value,
387 "count": row["count"],
388 "toggle_url": self.ds.absolute_url(
389 self.request, toggle_path
390 ),
391 "selected": selected,
392 }
393 )
394 except QueryInterrupted:
395 facets_timed_out.append(column)
397 return facet_results, facets_timed_out
400class DateFacet(Facet):
401 type = "date"
403 async def suggest(self):
404 columns = await self.get_columns(self.sql, self.params)
405 already_enabled = [c["config"]["simple"] for c in self.get_configs()]
406 suggested_facets = []
407 for column in columns:
408 if column in already_enabled:
409 continue
410 # Does this column contain any dates in the first 100 rows?
411 suggested_facet_sql = """
412 select date({column}) from (
413 {sql}
414 ) where {column} glob "????-??-*" limit 100;
415 """.format(
416 column=escape_sqlite(column), sql=self.sql
417 )
418 try:
419 results = await self.ds.execute(
420 self.database,
421 suggested_facet_sql,
422 self.params,
423 truncate=False,
424 custom_time_limit=self.ds.config("facet_suggest_time_limit_ms"),
425 log_sql_errors=False,
426 )
427 values = tuple(r[0] for r in results.rows)
428 if any(values):
429 suggested_facets.append(
430 {
431 "name": column,
432 "type": "date",
433 "toggle_url": self.ds.absolute_url(
434 self.request,
435 path_with_added_args(
436 self.request, {"_facet_date": column}
437 ),
438 ),
439 }
440 )
441 except (QueryInterrupted, sqlite3.OperationalError):
442 continue
443 return suggested_facets
445 async def facet_results(self):
446 facet_results = {}
447 facets_timed_out = []
448 args = dict(self.get_querystring_pairs())
449 facet_size = self.ds.config("default_facet_size")
450 for source_and_config in self.get_configs():
451 config = source_and_config["config"]
452 source = source_and_config["source"]
453 column = config.get("column") or config["simple"]
454 # TODO: does this query break if inner sql produces value or count columns?
455 facet_sql = """
456 select date({col}) as value, count(*) as count from (
457 {sql}
458 )
459 where date({col}) is not null
460 group by date({col}) order by count desc, value limit {limit}
461 """.format(
462 col=escape_sqlite(column), sql=self.sql, limit=facet_size + 1
463 )
464 try:
465 facet_rows_results = await self.ds.execute(
466 self.database,
467 facet_sql,
468 self.params,
469 truncate=False,
470 custom_time_limit=self.ds.config("facet_time_limit_ms"),
471 )
472 facet_results_values = []
473 facet_results[column] = {
474 "name": column,
475 "type": self.type,
476 "results": facet_results_values,
477 "hideable": source != "metadata",
478 "toggle_url": path_with_removed_args(
479 self.request, {"_facet_date": column}
480 ),
481 "truncated": len(facet_rows_results) > facet_size,
482 }
483 facet_rows = facet_rows_results.rows[:facet_size]
484 for row in facet_rows:
485 selected = str(args.get("{}__date".format(column))) == str(
486 row["value"]
487 )
488 if selected:
489 toggle_path = path_with_removed_args(
490 self.request, {"{}__date".format(column): str(row["value"])}
491 )
492 else:
493 toggle_path = path_with_added_args(
494 self.request, {"{}__date".format(column): row["value"]}
495 )
496 facet_results_values.append(
497 {
498 "value": row["value"],
499 "label": row["value"],
500 "count": row["count"],
501 "toggle_url": self.ds.absolute_url(
502 self.request, toggle_path
503 ),
504 "selected": selected,
505 }
506 )
507 except QueryInterrupted:
508 facets_timed_out.append(column)
510 return facet_results, facets_timed_out