Skip to content

Instantly share code, notes, and snippets.

Last active February 4, 2023 00:42
Show Gist options
  • Save cdrini/520c85be68c8768aefb84f19939e7def to your computer and use it in GitHub Desktop.
Save cdrini/520c85be68c8768aefb84f19939e7def to your computer and use it in GitHub Desktop.
Open Library Data Quality Score

This bookmarklet reports data-quality heuristics for the books in a search result, on an author page, or in a list.


To use it, go to this page and copy the bookmarklet. (GitHub won't let me place the bookmarklet here.)

Get the Bookmarklet

Supported pages:

  • ✅ Search results pages
  • ✅ Authors pages
  • ✅* Lists pages
    • Some of the checks don't work on lists pages
    • Authors or subjects stored on lists are ignored. Only works/editions are included
    • Checks are always applied at the work level, even for editions on a list
    • Sometimes the list seeds endpoint returns an error; see internetarchive/openlibrary#5415
    • Lists with more than ~200 items seem to error

Here are some pages to try it on:

async function main() {
/**
 * Owns the results table and its stylesheet, and drives one RunningCheck
 * per configured check.
 *
 * NOTE(review): this class was repaired from a copy of the source that had
 * lost its closing braces and several statements. The `this.style`
 * assignments, the bodies of mount()/unmount() and the row wiring inside
 * init() are reconstructions — confirm them against the canonical file.
 */
class DQSChecks {
  /**
   * @param {Object} opts
   * @param {Array<{name: string, query: string}>} opts.checks
   * @param {AbstractPageHandler} opts.pageHandler
   */
  constructor({ checks, pageHandler }) {
    this.checks = checks;
    this.pageHandler = pageHandler;
    this.runningChecks = [];

    this.table = document.createElement('table');
    this.table.classList.add('dqs-results');

    this.style = document.createElement('style');
    this.style.textContent = `
      .dqs-running-check td { padding: 0 8px; }
      .dqs-running-check:nth-child(even) td { background: #eee; }
      /** Override the weird stuff we do on mobile */
      .dqs-results td { display: table-cell !important; }
      .dqs-results tr { display: table-row !important; }
      button.dqs-run-again { border: 0; background: none; cursor: pointer;}
    `;
  }

  /**
   * Initialises the page handler, then creates and runs every check.
   * Resolves once all checks have settled.
   *
   * @throws Rethrows whatever `pageHandler.init()` throws. For a
   *   ListSeedsError a troubleshooting message is rendered into the table
   *   first.
   */
  async init() {
    this.table.textContent = 'Loading...';
    try {
      await this.pageHandler.init();
    } catch (e) {
      if (e instanceof ListSeedsError) {
        // Link to the failing endpoint with ?debug=true so the user can see
        // whether the server itself is erroring.
        const url = new URL(e.url, location.origin);
        url.searchParams.set('debug', 'true');
        this.table.innerHTML = `
          Error: Unable to fetch list seeds.
          Possibly related to <a href="https://github.com/internetarchive/openlibrary/issues/5415">issue #5415</a>; see if you
          get the same error when visiting <a href="${url}">/seeds.json</a>. If so, give that issue a thumbs up.
        `;
      }
      throw e;
    }

    this.table.textContent = '';
    this.runningChecks = this.checks.map((check) => new RunningCheck(check, this.pageHandler));

    // Rows are appended synchronously (so they keep the order of `checks`),
    // then every check runs concurrently.
    await Promise.all(this.runningChecks.map(async (rc) => {
      this.table.append(rc.view);
      rc.updateView();
      await rc.run();
      rc.updateView();
    }));
  }

  /** Adds the stylesheet and the results table to the page. */
  mount() {
    document.head.append(this.style);
    // NOTE(review): placement inside the insertion point (prepend) is assumed.
    this.pageHandler.insertionPoint.prepend(this.table);
  }

  /** Removes everything mount() added. */
  unmount() {
    this.table.remove();
    this.style.remove();
  }
}
/**
 * The data quality checks, in display order.
 *
 * Each `query` is a search clause that matches the records FAILING the
 * check; a row's score is derived from how many results it returns.
 *
 * @type {Array<{name: string, query: string}>}
 */
const CHECKS = [
  { name: 'At least 1 subject', query: 'NOT subject:*' },
  { name: 'At least 1 author', query: 'NOT author_key:*' },
  { name: 'At least 1 edition', query: 'edition_count:0' },
  { name: 'Has work (orphaned)', query: 'key:*M' },
  { name: 'Has publication year', query: 'NOT publish_year:*' },
  { name: 'Has cover', query: 'NOT cover_i:*' },
  { name: 'Has language', query: 'NOT language:*' },
  { name: 'Has publisher', query: 'NOT publisher:*' },
  { name: 'At least 2 editions', query: 'edition_count:[0 TO 1]' },
  { name: 'Has dewey decimal', query: 'NOT ddc:*' },
  { name: 'Has LoC classification', query: 'NOT lcc:*' },
  { name: 'Has number of pages', query: 'NOT number_of_pages_median:*' },
];
/**
 * One table row: runs a single check's search query and renders its
 * loading, error or result state into `this.view` (a <tr>).
 *
 * NOTE(review): repaired from a copy of the source that had lost closing
 * braces, template-literal endings and some markup. The name cell, the
 * <meter> attributes and the re-run wiring are reconstructions — confirm.
 */
class RunningCheck {
  /**
   * @param {{name: string, query: string}} check
   * @param {AbstractPageHandler} pageHandler
   */
  constructor(check, pageHandler) {
    this.check = check;
    this.pageHandler = pageHandler;
    this.view = document.createElement('tr');
    // The stylesheet styles rows through this class.
    this.view.classList.add('dqs-running-check');
    this.results = null;
    this.done = false;
    this.error = null;
  }

  /**
   * Fetches the count of records failing this check, then re-renders.
   * Never rejects on a failed request: the failure is stored in
   * `this.error` (the raw response text when available, else the thrown
   * error).
   */
  async run() {
    this.error = null;
    const url = new URL(this.pageHandler.getSearchUrl(this.check.query, true), location.origin);
    // Only the match count is needed, not the documents themselves.
    url.searchParams.set('rows', '0');

    let textResp = null;
    try {
      const res = await fetch(url);
      textResp = await res.text();
      const parsed = JSON.parse(textResp);
      // Valid JSON without a count (e.g. an error payload) is still a failure.
      if (typeof parsed?.numFound !== 'number') {
        throw new Error('Search response has no numFound');
      }
      this.results = parsed;
    } catch (e) {
      this.error = textResp || e;
    }

    this.done = true;
    this.updateView();
  }

  /** Renders the row for the current state: loading, error or result. */
  updateView() {
    const nameCell = `<td>${this.check.name}</td>`;
    // Text glyph instead of an <img>: the image URL in this copy is empty.
    const rerunCell = '<td><button class="dqs-run-again" title="Run again">&#x21bb;</button></td>';

    if (!this.done) {
      this.view.innerHTML = `
        ${nameCell}
        <td colspan="3">Loading...</td>
      `;
    } else if (this.error) {
      const href = this.pageHandler.getSearchUrl(this.check.query, false);
      this.view.innerHTML = `
        ${nameCell}
        <td colspan="2"><a href="${href}">Error</a></td>
        ${rerunCell}
      `;
      // Set through the DOM property so the error text is never parsed as HTML.
      this.view.querySelector('td:nth-child(2)').title = String(this.error);
    } else {
      const totalCount = this.pageHandler.totalCount;
      const failing = this.results.numFound;
      const passing = totalCount - failing;
      // Guard the division: an empty result set would otherwise render NaN%.
      const percent = totalCount > 0 ? Math.floor((passing / totalCount) * 100) : 0;
      const href = this.pageHandler.getSearchUrl(this.check.query, false);
      this.view.innerHTML = `
        ${nameCell}
        <td><meter
          min="0"
          max="${totalCount}"
          value="${passing}"
          title="${passing} of ${totalCount}"
        ></meter> ${percent}%</td>
        <td style="text-align:right"><a href="${href}">${failing} failing</a></td>
        ${rerunCell}
      `;
    }

    // innerHTML replaced the button, so the listener is registered afresh on
    // every render; `once` is enough because a click triggers a re-render.
    this.view.querySelector('.dqs-run-again')?.addEventListener('click', () => {
      this.done = false;
      this.updateView();
      // run() stores its own failures, so the promise never rejects here.
      void this.run();
    }, { once: true });
  }
}
/**
 * Base class for the per-page adapters. A handler reports whether it
 * applies to the current page, how many records the page covers, and how to
 * build a search URL scoped to those records.
 */
class AbstractPageHandler {
  /** @returns {boolean} whether this handler applies to the current page */
  test() { throw new Error('Not implemented'); }

  /** @returns {number} number of records the page covers */
  get totalCount() { throw new Error('Not implemented'); }

  /** @returns {string} search query selecting the page's records */
  get curQuery() { throw new Error('Not implemented'); }

  /** @returns {Element|null} element the results table is inserted into */
  get insertionPoint() { return document.getElementById('contentBody'); }

  /** Hook for asynchronous set-up; the default does nothing. */
  async init() {}

  /**
   * Builds a search URL for the page's records narrowed by `extraQuery`.
   *
   * @param {string} [extraQuery] extra clause appended to `curQuery`
   * @param {boolean} [json] target `/search.json` instead of `/search`
   * @returns {string} a root-relative URL
   */
  getSearchUrl(extraQuery = '', json = true) {
    const params = new URLSearchParams({
      // trim() drops the dangling space left when extraQuery is empty.
      q: `${this.curQuery} ${extraQuery}`.trim(),
    });
    return `/search${json ? '.json' : ''}?${params}`;
  }
}
/** Page handler for search result pages (`/search`). */
class SearchPageHandler extends AbstractPageHandler {
  /** @returns {boolean} true on the search results page */
  test() {
    return location.pathname === '/search';
  }

  /** @returns {number} the hit count shown in the page header */
  get totalCount() {
    const text = document.querySelector('#contentHead .darkgreen').textContent;
    // Strip thousands separators before parsing.
    return parseFloat(text.trim().replace(/,/g, ''));
  }

  /**
   * Because the default search can also have stuff in url parameters, we
   * don't try to convert those back into a lucene query. This handler only
   * supports `getSearchUrl`.
   */
  get curQuery() {
    throw new Error('Not supported');
  }

  /**
   * Derives the URL from the current location so every existing search
   * parameter is kept, then appends `extraQuery` to the `q` parameter.
   *
   * @param {string} [extraQuery] extra clause appended to `q`
   * @param {boolean} [json] target `/search.json` instead of `/search`
   * @returns {URL}
   */
  getSearchUrl(extraQuery = '', json = false) {
    const here = location.toString();
    const url = new URL(json ? here.replace('/search', '/search.json') : here);
    // `q` is absent when the search used only other parameters; treat that
    // as an empty query instead of calling .replace on null.
    const queryPart = (url.searchParams.get('q') ?? '')
      /** This is an OL bug :/ */
      .replace(/author:/g, 'author_name:');
    url.searchParams.set('q', `${queryPart} ${extraQuery}`.trim());
    return url;
  }
}
/** Page handler for author pages (`/authors/<key>/...`). */
class AuthorPageHandler extends AbstractPageHandler {
  /** @returns {boolean} true on a page under `/authors/` */
  test() {
    // Require the trailing slash: curQuery needs a key segment after it.
    return location.pathname.startsWith('/authors/');
  }

  /** @returns {number} the count parsed from the start of the `#works h2` heading */
  get totalCount() {
    const heading = document.querySelector('#works h2').textContent;
    // Strip thousands separators; parseFloat ignores text after the number.
    return parseFloat(heading.trim().replace(/,/g, ''));
  }

  /** @returns {string} a query selecting records by the author key in the URL */
  get curQuery() {
    const authorKey = location.pathname.split('/')[2];
    return `author_key:${authorKey}`;
  }
}
/** Raised when a list's seeds.json cannot be fetched or parsed. */
class ListSeedsError extends Error {
  /**
   * @param {string} url the seeds URL that failed
   * @param {{cause?: unknown}} [options] forwarded to Error, e.g. `{ cause }`
   */
  constructor(url, options) {
    // A derived class must call super() before touching `this`.
    super(`Unable to fetch list seeds from ${url}`, options);
    this.name = 'ListSeedsError';
    this.url = url;
  }
}
/**
 * Page handler for list pages. Builds its query from the list's seeds.
 *
 * NOTE(review): repaired from a copy of the source that had lost closing
 * braces and parts of `curQuery`. Reading the seeds from
 * `this.seeds.entries` and the `null` fallback for other seed types are
 * reconstructions — confirm against the seeds.json response.
 */
class ListPageHandler extends AbstractPageHandler {
  /** @returns {boolean} true when the path contains `/lists/` */
  test() {
    return location.pathname.includes('/lists/');
  }

  /**
   * Loads the list's seeds into `this.seeds`.
   *
   * @throws {ListSeedsError} when the request or JSON parsing fails
   */
  async init() {
    // A path ending in the list key gets /seeds.json appended; otherwise the
    // last path segment is swapped for seeds.json.
    const url = /\/lists\/OL\d+L$/.test(location.pathname)
      ? `${location.pathname}/seeds.json`
      : location.pathname.replace(/\/[^\/]+$/, '/seeds.json');
    try {
      const res = await fetch(url);
      this.seeds = await res.json();
    } catch (e) {
      // Keep the underlying failure attached for debugging.
      throw new ListSeedsError(url, { cause: e });
    }
  }

  /** @returns {number} the `size` reported by the seeds response */
  get totalCount() {
    return this.seeds.size;
  }

  /** @returns {string} a query selecting the list's works and editions */
  get curQuery() {
    const parts = this.seeds.entries
      .map((e) => (
        e.type === 'edition' ? ['edition_key', e.url.split('/')[2]] :
        e.type === 'work' ? ['key', e.url] :
        null
      ))
      .filter((x) => x);

    /* Lazy groupBy */
    const groups = {};
    for (const [k, v] of parts) {
      groups[k] = groups[k] || [];
      groups[k].push(v);
    }

    const clauses = Object.entries(groups).map(([k, v]) => `${k}:(${v.join(' OR ')})`);
    return `(${clauses.join(' ')})`;
  }

  get insertionPoint() {
    /* When on my own lists, want it in .details-content. Public lists should use the default. */
    return document.querySelector('.details-content') || super.insertionPoint;
  }
}
/** Thrown when no page handler recognises the current page. */
class UnsupportedPageError extends Error {}
/**
 * Returns how many search results match `query`.
 *
 * NOTE(review): the URL path was missing from this copy of the source;
 * `/search.json` is assumed, matching AbstractPageHandler.getSearchUrl.
 *
 * @param {string} query search query
 * @returns {Promise<number>} the `numFound` field of the response
 */
async function getResultsCount(query) {
  const params = new URLSearchParams({
    q: query,
    // Only the count is needed, not the documents.
    rows: '0',
  });
  const res = await fetch(`/search.json?${params}`);
  const { numFound } = await res.json();
  return numFound;
}
/**
 * Candidate page handlers. Order matters: the first handler whose test()
 * returns true is used.
 */
const handlers = [
  new SearchPageHandler(),
  new AuthorPageHandler(),
  new ListPageHandler(),
];
/**
 * Picks the handler for the current page and (re)creates the results table.
 *
 * NOTE(review): the statements after the `window.DQS_GLOBAL` check were
 * missing from this copy of the source; the unmount → mount → init sequence
 * is a reconstruction — confirm.
 *
 * @throws {UnsupportedPageError} when no handler matches the current page
 */
async function start() {
  const handler = handlers.find((h) => h.test());
  if (!handler) {
    throw new UnsupportedPageError();
  }

  // Running the bookmarklet again replaces the previous table.
  if (window.DQS_GLOBAL) {
    window.DQS_GLOBAL.unmount();
  }

  window.DQS_GLOBAL = new DQSChecks({
    checks: CHECKS,
    pageHandler: handler,
  });
  window.DQS_GLOBAL.mount();
  await window.DQS_GLOBAL.init();
}
// Entry point: run the checks and explain an unsupported page to the user.
try {
  await start();
} catch (e) {
  if (e instanceof UnsupportedPageError) {
    // List pages are handled too, so the message names them as well.
    alert('Data Quality Score only works on search, author, or list pages.');
  } else {
    // NOTE(review): this branch's body was missing from this copy of the
    // source; rethrowing keeps unexpected failures visible — confirm.
    throw e;
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment