Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import json 

2import urllib 

3import re 

4from datasette import hookimpl 

5from datasette.database import QueryInterrupted 

6from datasette.utils import ( 

7 escape_sqlite, 

8 path_with_added_args, 

9 path_with_removed_args, 

10 detect_json1, 

11 InvalidSql, 

12 sqlite3, 

13) 

14 

15 

16def load_facet_configs(request, table_metadata): 

17 # Given a request and the metadata configuration for a table, return 

18 # a dictionary of selected facets, their lists of configs and for each 

19 # config whether it came from the request or the metadata. 

20 # 

21 # return {type: [ 

22 # {"source": "metadata", "config": config1}, 

23 # {"source": "request", "config": config2}]} 

24 facet_configs = {} 

25 table_metadata = table_metadata or {} 

26 metadata_facets = table_metadata.get("facets", []) 

27 for metadata_config in metadata_facets: 

28 if isinstance(metadata_config, str): 

29 type = "column" 

30 metadata_config = {"simple": metadata_config} 

31 else: 

32 assert ( 

33 len(metadata_config.values()) == 1 

34 ), "Metadata config dicts should be {type: config}" 

35 type, metadata_config = metadata_config.items()[0] 

36 if isinstance(metadata_config, str): 

37 metadata_config = {"simple": metadata_config} 

38 facet_configs.setdefault(type, []).append( 

39 {"source": "metadata", "config": metadata_config} 

40 ) 

41 qs_pairs = urllib.parse.parse_qs(request.query_string, keep_blank_values=True) 

42 for key, values in qs_pairs.items(): 

43 if key.startswith("_facet"): 

44 # Figure out the facet type 

45 if key == "_facet": 

46 type = "column" 

47 elif key.startswith("_facet_"): 

48 type = key[len("_facet_") :] 

49 for value in values: 

50 # The value is the config - either JSON or not 

51 if value.startswith("{"): 

52 config = json.loads(value) 

53 else: 

54 config = {"simple": value} 

55 facet_configs.setdefault(type, []).append( 

56 {"source": "request", "config": config} 

57 ) 

58 return facet_configs 

59 

60 

61@hookimpl 

62def register_facet_classes(): 

63 classes = [ColumnFacet, DateFacet] 

64 if detect_json1(): 

65 classes.append(ArrayFacet) 

66 return classes 

67 

68 

69class Facet: 

70 type = None 

71 

72 def __init__( 

73 self, 

74 ds, 

75 request, 

76 database, 

77 sql=None, 

78 table=None, 

79 params=None, 

80 metadata=None, 

81 row_count=None, 

82 ): 

83 assert table or sql, "Must provide either table= or sql=" 

84 self.ds = ds 

85 self.request = request 

86 self.database = database 

87 # For foreign key expansion. Can be None for e.g. canned SQL queries: 

88 self.table = table 

89 self.sql = sql or "select * from [{}]".format(table) 

90 self.params = params or [] 

91 self.metadata = metadata 

92 # row_count can be None, in which case we calculate it ourselves: 

93 self.row_count = row_count 

94 

95 def get_configs(self): 

96 configs = load_facet_configs(self.request, self.metadata) 

97 return configs.get(self.type) or [] 

98 

99 def get_querystring_pairs(self): 

100 # ?_foo=bar&_foo=2&empty= becomes: 

101 # [('_foo', 'bar'), ('_foo', '2'), ('empty', '')] 

102 return urllib.parse.parse_qsl(self.request.query_string, keep_blank_values=True) 

103 

104 async def suggest(self): 

105 return [] 

106 

107 async def facet_results(self): 

108 # returns ([results], [timed_out]) 

109 # TODO: Include "hideable" with each one somehow, which indicates if it was 

110 # defined in metadata (in which case you cannot turn it off) 

111 raise NotImplementedError 

112 

113 async def get_columns(self, sql, params=None): 

114 # Detect column names using the "limit 0" trick 

115 return ( 

116 await self.ds.execute( 

117 self.database, "select * from ({}) limit 0".format(sql), params or [] 

118 ) 

119 ).columns 

120 

121 async def get_row_count(self): 

122 if self.row_count is None: 

123 self.row_count = ( 

124 await self.ds.execute( 

125 self.database, 

126 "select count(*) from ({})".format(self.sql), 

127 self.params, 

128 ) 

129 ).rows[0][0] 

130 return self.row_count 

131 

132 

133class ColumnFacet(Facet): 

134 type = "column" 

135 

136 async def suggest(self): 

137 row_count = await self.get_row_count() 

138 columns = await self.get_columns(self.sql, self.params) 

139 facet_size = self.ds.config("default_facet_size") 

140 suggested_facets = [] 

141 already_enabled = [c["config"]["simple"] for c in self.get_configs()] 

142 for column in columns: 

143 if column in already_enabled: 

144 continue 

145 suggested_facet_sql = """ 

146 select {column}, count(*) as n from ( 

147 {sql} 

148 ) where {column} is not null 

149 group by {column} 

150 limit {limit} 

151 """.format( 

152 column=escape_sqlite(column), sql=self.sql, limit=facet_size + 1 

153 ) 

154 distinct_values = None 

155 try: 

156 distinct_values = await self.ds.execute( 

157 self.database, 

158 suggested_facet_sql, 

159 self.params, 

160 truncate=False, 

161 custom_time_limit=self.ds.config("facet_suggest_time_limit_ms"), 

162 ) 

163 num_distinct_values = len(distinct_values) 

164 if ( 

165 num_distinct_values 

166 and num_distinct_values > 1 

167 and num_distinct_values <= facet_size 

168 and num_distinct_values < row_count 

169 # And at least one has n > 1 

170 and any(r["n"] > 1 for r in distinct_values) 

171 ): 

172 suggested_facets.append( 

173 { 

174 "name": column, 

175 "toggle_url": self.ds.absolute_url( 

176 self.request, 

177 path_with_added_args(self.request, {"_facet": column}), 

178 ), 

179 } 

180 ) 

181 except QueryInterrupted: 

182 continue 

183 return suggested_facets 

184 

185 async def facet_results(self): 

186 facet_results = {} 

187 facets_timed_out = [] 

188 

189 qs_pairs = self.get_querystring_pairs() 

190 

191 facet_size = self.ds.config("default_facet_size") 

192 for source_and_config in self.get_configs(): 

193 config = source_and_config["config"] 

194 source = source_and_config["source"] 

195 column = config.get("column") or config["simple"] 

196 facet_sql = """ 

197 select {col} as value, count(*) as count from ( 

198 {sql} 

199 ) 

200 where {col} is not null 

201 group by {col} order by count desc, value limit {limit} 

202 """.format( 

203 col=escape_sqlite(column), sql=self.sql, limit=facet_size + 1 

204 ) 

205 try: 

206 facet_rows_results = await self.ds.execute( 

207 self.database, 

208 facet_sql, 

209 self.params, 

210 truncate=False, 

211 custom_time_limit=self.ds.config("facet_time_limit_ms"), 

212 ) 

213 facet_results_values = [] 

214 facet_results[column] = { 

215 "name": column, 

216 "type": self.type, 

217 "hideable": source != "metadata", 

218 "toggle_url": path_with_removed_args( 

219 self.request, {"_facet": column} 

220 ), 

221 "results": facet_results_values, 

222 "truncated": len(facet_rows_results) > facet_size, 

223 } 

224 facet_rows = facet_rows_results.rows[:facet_size] 

225 if self.table: 

226 # Attempt to expand foreign keys into labels 

227 values = [row["value"] for row in facet_rows] 

228 expanded = await self.ds.expand_foreign_keys( 

229 self.database, self.table, column, values 

230 ) 

231 else: 

232 expanded = {} 

233 for row in facet_rows: 

234 selected = (column, str(row["value"])) in qs_pairs 

235 if selected: 

236 toggle_path = path_with_removed_args( 

237 self.request, {column: str(row["value"])} 

238 ) 

239 else: 

240 toggle_path = path_with_added_args( 

241 self.request, {column: row["value"]} 

242 ) 

243 facet_results_values.append( 

244 { 

245 "value": row["value"], 

246 "label": expanded.get((column, row["value"]), row["value"]), 

247 "count": row["count"], 

248 "toggle_url": self.ds.absolute_url( 

249 self.request, toggle_path 

250 ), 

251 "selected": selected, 

252 } 

253 ) 

254 except QueryInterrupted: 

255 facets_timed_out.append(column) 

256 

257 return facet_results, facets_timed_out 

258 

259 

260class ArrayFacet(Facet): 

261 type = "array" 

262 

263 def _is_json_array_of_strings(self, json_string): 

264 try: 

265 array = json.loads(json_string) 

266 except ValueError: 

267 return False 

268 for item in array: 

269 if not isinstance(item, str): 

270 return False 

271 return True 

272 

273 async def suggest(self): 

274 columns = await self.get_columns(self.sql, self.params) 

275 suggested_facets = [] 

276 already_enabled = [c["config"]["simple"] for c in self.get_configs()] 

277 for column in columns: 

278 if column in already_enabled: 

279 continue 

280 # Is every value in this column either null or a JSON array? 

281 suggested_facet_sql = """ 

282 select distinct json_type({column}) 

283 from ({sql}) 

284 """.format( 

285 column=escape_sqlite(column), sql=self.sql 

286 ) 

287 try: 

288 results = await self.ds.execute( 

289 self.database, 

290 suggested_facet_sql, 

291 self.params, 

292 truncate=False, 

293 custom_time_limit=self.ds.config("facet_suggest_time_limit_ms"), 

294 log_sql_errors=False, 

295 ) 

296 types = tuple(r[0] for r in results.rows) 

297 if types in (("array",), ("array", None)): 

298 # Now sanity check that first 100 arrays contain only strings 

299 first_100 = [ 

300 v[0] 

301 for v in await self.ds.execute( 

302 self.database, 

303 "select {column} from ({sql}) where {column} is not null and json_array_length({column}) > 0 limit 100".format( 

304 column=escape_sqlite(column), sql=self.sql 

305 ), 

306 self.params, 

307 truncate=False, 

308 custom_time_limit=self.ds.config( 

309 "facet_suggest_time_limit_ms" 

310 ), 

311 log_sql_errors=False, 

312 ) 

313 ] 

314 if first_100 and all( 

315 self._is_json_array_of_strings(r) for r in first_100 

316 ): 

317 suggested_facets.append( 

318 { 

319 "name": column, 

320 "type": "array", 

321 "toggle_url": self.ds.absolute_url( 

322 self.request, 

323 path_with_added_args( 

324 self.request, {"_facet_array": column} 

325 ), 

326 ), 

327 } 

328 ) 

329 except (QueryInterrupted, sqlite3.OperationalError): 

330 continue 

331 return suggested_facets 

332 

333 async def facet_results(self): 

334 # self.configs should be a plain list of columns 

335 facet_results = {} 

336 facets_timed_out = [] 

337 

338 facet_size = self.ds.config("default_facet_size") 

339 for source_and_config in self.get_configs(): 

340 config = source_and_config["config"] 

341 source = source_and_config["source"] 

342 column = config.get("column") or config["simple"] 

343 facet_sql = """ 

344 select j.value as value, count(*) as count from ( 

345 {sql} 

346 ) join json_each({col}) j 

347 group by j.value order by count desc, value limit {limit} 

348 """.format( 

349 col=escape_sqlite(column), sql=self.sql, limit=facet_size + 1 

350 ) 

351 try: 

352 facet_rows_results = await self.ds.execute( 

353 self.database, 

354 facet_sql, 

355 self.params, 

356 truncate=False, 

357 custom_time_limit=self.ds.config("facet_time_limit_ms"), 

358 ) 

359 facet_results_values = [] 

360 facet_results[column] = { 

361 "name": column, 

362 "type": self.type, 

363 "results": facet_results_values, 

364 "hideable": source != "metadata", 

365 "toggle_url": path_with_removed_args( 

366 self.request, {"_facet_array": column} 

367 ), 

368 "truncated": len(facet_rows_results) > facet_size, 

369 } 

370 facet_rows = facet_rows_results.rows[:facet_size] 

371 pairs = self.get_querystring_pairs() 

372 for row in facet_rows: 

373 value = str(row["value"]) 

374 selected = ("{}__arraycontains".format(column), value) in pairs 

375 if selected: 

376 toggle_path = path_with_removed_args( 

377 self.request, {"{}__arraycontains".format(column): value} 

378 ) 

379 else: 

380 toggle_path = path_with_added_args( 

381 self.request, {"{}__arraycontains".format(column): value} 

382 ) 

383 facet_results_values.append( 

384 { 

385 "value": value, 

386 "label": value, 

387 "count": row["count"], 

388 "toggle_url": self.ds.absolute_url( 

389 self.request, toggle_path 

390 ), 

391 "selected": selected, 

392 } 

393 ) 

394 except QueryInterrupted: 

395 facets_timed_out.append(column) 

396 

397 return facet_results, facets_timed_out 

398 

399 

400class DateFacet(Facet): 

401 type = "date" 

402 

403 async def suggest(self): 

404 columns = await self.get_columns(self.sql, self.params) 

405 already_enabled = [c["config"]["simple"] for c in self.get_configs()] 

406 suggested_facets = [] 

407 for column in columns: 

408 if column in already_enabled: 

409 continue 

410 # Does this column contain any dates in the first 100 rows? 

411 suggested_facet_sql = """ 

412 select date({column}) from ( 

413 {sql} 

414 ) where {column} glob "????-??-*" limit 100; 

415 """.format( 

416 column=escape_sqlite(column), sql=self.sql 

417 ) 

418 try: 

419 results = await self.ds.execute( 

420 self.database, 

421 suggested_facet_sql, 

422 self.params, 

423 truncate=False, 

424 custom_time_limit=self.ds.config("facet_suggest_time_limit_ms"), 

425 log_sql_errors=False, 

426 ) 

427 values = tuple(r[0] for r in results.rows) 

428 if any(values): 

429 suggested_facets.append( 

430 { 

431 "name": column, 

432 "type": "date", 

433 "toggle_url": self.ds.absolute_url( 

434 self.request, 

435 path_with_added_args( 

436 self.request, {"_facet_date": column} 

437 ), 

438 ), 

439 } 

440 ) 

441 except (QueryInterrupted, sqlite3.OperationalError): 

442 continue 

443 return suggested_facets 

444 

445 async def facet_results(self): 

446 facet_results = {} 

447 facets_timed_out = [] 

448 args = dict(self.get_querystring_pairs()) 

449 facet_size = self.ds.config("default_facet_size") 

450 for source_and_config in self.get_configs(): 

451 config = source_and_config["config"] 

452 source = source_and_config["source"] 

453 column = config.get("column") or config["simple"] 

454 # TODO: does this query break if inner sql produces value or count columns? 

455 facet_sql = """ 

456 select date({col}) as value, count(*) as count from ( 

457 {sql} 

458 ) 

459 where date({col}) is not null 

460 group by date({col}) order by count desc, value limit {limit} 

461 """.format( 

462 col=escape_sqlite(column), sql=self.sql, limit=facet_size + 1 

463 ) 

464 try: 

465 facet_rows_results = await self.ds.execute( 

466 self.database, 

467 facet_sql, 

468 self.params, 

469 truncate=False, 

470 custom_time_limit=self.ds.config("facet_time_limit_ms"), 

471 ) 

472 facet_results_values = [] 

473 facet_results[column] = { 

474 "name": column, 

475 "type": self.type, 

476 "results": facet_results_values, 

477 "hideable": source != "metadata", 

478 "toggle_url": path_with_removed_args( 

479 self.request, {"_facet_date": column} 

480 ), 

481 "truncated": len(facet_rows_results) > facet_size, 

482 } 

483 facet_rows = facet_rows_results.rows[:facet_size] 

484 for row in facet_rows: 

485 selected = str(args.get("{}__date".format(column))) == str( 

486 row["value"] 

487 ) 

488 if selected: 

489 toggle_path = path_with_removed_args( 

490 self.request, {"{}__date".format(column): str(row["value"])} 

491 ) 

492 else: 

493 toggle_path = path_with_added_args( 

494 self.request, {"{}__date".format(column): row["value"]} 

495 ) 

496 facet_results_values.append( 

497 { 

498 "value": row["value"], 

499 "label": row["value"], 

500 "count": row["count"], 

501 "toggle_url": self.ds.absolute_url( 

502 self.request, toggle_path 

503 ), 

504 "selected": selected, 

505 } 

506 ) 

507 except QueryInterrupted: 

508 facets_timed_out.append(column) 

509 

510 return facet_results, facets_timed_out