Skip to content

Instantly share code, notes, and snippets.

@cdrini
Last active February 4, 2023 00:42
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cdrini/520c85be68c8768aefb84f19939e7def to your computer and use it in GitHub Desktop.
Save cdrini/520c85be68c8768aefb84f19939e7def to your computer and use it in GitHub Desktop.
Open Library Data Quality Score

This is a bookmarklet that gives some heuristics for data quality of the books in a search result, author page, or list.

image

To use, go to this page and copy the bookmarklet! (GitHub won't let me place the bookmarklet here).

Get the Bookmarklet

Supported pages:

  • ✅ Search results pages
  • ✅ Authors pages
  • ✅* Lists pages
    • Some of the checks don't work on lists pages
    • Authors or subjects stored on lists are ignored. Only works/editions are included
    • Checks are always applied at the work level, even for editions on a list
    • Sometimes the list seeds endpoint errors :/ See internetarchive/openlibrary#5415
    • Lists with more than ~200 items seem to error

Here are some pages to try it on:

javascript:
async function main() {
/**
 * Top-level widget: owns the results table and its stylesheet, mounts them on
 * the page, initialises the page handler and then creates one RunningCheck
 * (one table row) per configured check.
 */
class DQSChecks {
  /**
   * Builds the table and stylesheet elements and starts `init()`.
   *
   * @param {Object} opts
   * @param {Array<{name: string, query: string}>} opts.checks
   * @param {AbstractPageHandler} opts.pageHandler
   */
  constructor({checks, pageHandler}) {
    this.checks = checks;
    this.pageHandler = pageHandler;
    this.table = document.createElement('table');
    this.table.classList.add('dqs-results');
    this.style = document.createElement('style');
    this.style.textContent = `
.dqs-running-check td { padding: 0 8px; }
.dqs-running-check:nth-child(even) td { background: #eee; }
/** Override the weird stuff we do on mobile */
.dqs-results td { display: table-cell !important; }
.dqs-results tr { display: table-row !important; }
button.dqs-run-again { border: 0; background: none; cursor: pointer;}
`;
    /*
     * A constructor cannot await, so report a failed init() here instead of
     * leaving an unhandled promise rejection. init() has already written the
     * failure into the table by the time this handler runs.
     */
    this.init().catch(err => console.error(err));
  }

  /**
   * Mounts the widget, waits for the page handler to initialise and then
   * fills the table with one row per check.
   *
   * @returns {Promise<void>}
   * @throws Re-throws whatever `pageHandler.init()` rejected with, after
   *   replacing the "Loading..." placeholder with an error message.
   */
  async init() {
    this.mount();
    this.table.textContent = 'Loading...';
    try {
      await this.pageHandler.init();
    } catch (e) {
      if (e instanceof ListSeedsError) {
        /* Link to the seeds endpoint with debug=true so the user can check it directly. */
        const url = new URL(e.url, location.origin);
        url.searchParams.set('debug', 'true');
        this.table.innerHTML = `
Error: Unable to fetch list seeds:
Possibly related to <a href="https://github.com/internetarchive/openlibrary/issues/5415">issue #5415</a> for more information; see if you
get the same error when visiting <a href="${url}">/seeds.json</a>. If so, give that issue a thumbs up.
`;
      } else {
        /* Any other failure: do not leave the table stuck on "Loading...". */
        this.table.textContent = String(e);
      }
      throw e;
    }
    this.table.textContent = '';
    this.runningChecks = this.checks.map(check => new RunningCheck(check, this.pageHandler));
    for (const rc of this.runningChecks) {
      this.table.appendChild(rc.view);
    }
  }

  /** Adds the stylesheet to <head> and puts the table first inside the handler's insertion point. */
  mount() {
    document.head.appendChild(this.style);
    this.pageHandler.insertionPoint.prepend(this.table);
  }

  /** Removes the stylesheet and the table from the document. */
  unmount() {
    this.style.remove();
    this.table.remove();
  }
}
/*
 * The data-quality checks, as [display name, search query] pairs expanded into
 * {name, query} objects. Each query selects the records that FAIL the check;
 * the number of matches is what gets reported as "failing".
 */
const CHECKS = [
  ['At least 1 subject', 'NOT subject:*'],
  ['At least 1 author', 'NOT author_key:*'],
  ['At least 1 edition', 'edition_count:0'],
  ['Has work (orphaned)', 'key:*M'],
  ['Has publication year', 'NOT publish_year:*'],
  ['Has cover', 'NOT cover_i:*'],
  ['Has language', 'NOT language:*'],
  ['Has publisher', 'NOT publisher:*'],
  ['At least 2 editions', 'edition_count:[0 TO 1]'],
  ['Has dewey decimal', 'NOT ddc:*'],
  ['Has LoC classification', 'NOT lcc:*'],
  ['Has number of pages', 'NOT number_of_pages_median:*'],
].map(([name, query]) => ({ name, query }));
/**
 * One row of the results table: runs a single check's search query and renders
 * a loading, error or result state into `this.view` (a <tr>).
 */
class RunningCheck {
  /**
   * Creates the row, renders the loading state and starts the first run.
   *
   * @param {{name: string, query: string}} check
   * @param {AbstractPageHandler} pageHandler
   */
  constructor(check, pageHandler) {
    this.check = check;
    this.pageHandler = pageHandler;
    this.view = document.createElement('tr');
    this.view.classList.add('dqs-running-check');
    this.results = null;
    this.done = false;
    this.error = null;
    this.updateView();
    this.run();
  }

  /**
   * Fetches the number of records matching the check's query (rows=0, so only
   * the count is requested) and re-renders the row.
   *
   * Network failures, non-JSON bodies and JSON bodies without a numeric
   * `numFound` are all recorded in `this.error` rather than thrown.
   *
   * @returns {Promise<void>}
   */
  async run() {
    this.error = null;
    const url = new URL(this.pageHandler.getSearchUrl(this.check.query, true), location.origin);
    url.searchParams.set('rows', 0);
    let textResp = null;
    try {
      textResp = await fetch(url).then(res => res.text());
      const parsed = JSON.parse(textResp);
      /* A JSON body without a count cannot be rendered as a result; treat it as an error. */
      if (typeof parsed?.numFound !== 'number') {
        throw new Error('Search response has no numeric numFound');
      }
      this.results = parsed;
    } catch (e) {
      /* Prefer the raw response body (if any) since it usually explains the failure. */
      this.error = textResp || e;
    }
    this.done = true;
    this.updateView();
  }

  /**
   * Renders the current state into the row and wires up the "Run again"
   * button when one is present.
   */
  updateView() {
    if (!this.done) {
      this.view.innerHTML = `
<td><b>${this.check.name}</b></td>
<td colspan="3">Loading...</td>
`;
    } else if (this.error) {
      const href = this.pageHandler.getSearchUrl(this.check.query, false);
      this.view.innerHTML = `
<td><b>${this.check.name}</b></td>
<td colspan="2"><a href="${href}">Error</a></td>
<td><button class="dqs-run-again" title="Run again"><img src="https://upload.wikimedia.org/wikipedia/commons/2/2a/Gnome-view-refresh.svg" width="20px"></button></td>
`;
      /* Set via the DOM property so the error text needs no HTML escaping. */
      this.view.querySelector('td:nth-child(2)').title = this.error;
    } else {
      const totalCount = this.pageHandler.totalCount;
      const passing = totalCount - this.results.numFound;
      /* With zero items nothing can fail; avoid 0 / 0, which would render as "NaN%". */
      const percent = totalCount === 0 ? 100 : Math.floor(passing / totalCount * 100);
      const href = this.pageHandler.getSearchUrl(this.check.query, false);
      this.view.innerHTML = `
<td><b>${this.check.name}</b></td>
<td>
<meter
value="${percent}"
min="0"
max="100"
title="${passing} of ${totalCount}"
></meter> ${percent}%
</td>
<td style="text-align:right"><a href="${href}">${this.results.numFound} failing</a></td>
<td><button class="dqs-run-again" title="Run again"><img src="https://upload.wikimedia.org/wikipedia/commons/2/2a/Gnome-view-refresh.svg" width="20px"></button></td>
`;
    }
    /* The loading state has no button, hence the optional chaining. */
    this.view.querySelector('.dqs-run-again')?.addEventListener('click', ev => {
      ev.preventDefault();
      this.done = false;
      this.updateView();
      this.run();
    }, { once: true });
  }
}
/**
 * Base class for the per-page handlers. Subclasses say whether they apply to
 * the current page, how many items the page lists, and which search query
 * selects those items.
 */
class AbstractPageHandler {
  /** Whether this handler applies to the current page. Must be overridden. */
  test() {
    throw new Error('Not implemented');
  }

  /** Number of items on the current page. Must be overridden. */
  get totalCount() {
    throw new Error('Not implemented');
  }

  /** Search query selecting the current page's items. Must be overridden. */
  get curQuery() {
    throw new Error('Not implemented');
  }

  /** Element the results table is inserted into. */
  get insertionPoint() {
    const contentBody = document.getElementById('contentBody');
    return contentBody;
  }

  /** Hook for asynchronous setup; the base implementation does nothing. */
  async init() {}

  /**
   * Builds a relative search URL combining the page's query with `extraQuery`.
   *
   * @param {string} [extraQuery] appended to `curQuery`, separated by a space
   * @param {boolean} [json] use `/search.json` instead of `/search`
   * @returns {string}
   */
  getSearchUrl(extraQuery='', json=true) {
    const endpoint = json ? '/search.json' : '/search';
    const params = new URLSearchParams({ q: `${this.curQuery} ${extraQuery}` });
    return `${endpoint}?${params}`;
  }
}
/**
 * Handler for the `/search` results page. Rather than rebuilding a query
 * string, it reuses the page's own URL (and all of its parameters) and
 * appends the extra clause to the `q` parameter.
 */
class SearchPageHandler extends AbstractPageHandler {
  /** Applies only when the current path is exactly `/search`. */
  test() {
    return location.pathname === '/search';
  }

  /** Result count read from the page header, with thousands separators removed. */
  get totalCount() {
    const countText = document.querySelector('#contentHead .darkgreen').textContent;
    return parseFloat(countText.trim().replace(/\,/g, ''));
  }

  get curQuery() {
    /**
     * Because the default search can also have stuff in url parameters, we don't try to convert
     * those back into a lucene query. This only supports `getSearchUrl`.
     */
    throw new Error('Not supported');
  }

  /**
   * Returns the current page URL with `extraQuery` appended to its `q`
   * parameter. Note that `json` defaults to false here, unlike the base class.
   *
   * @param {string} [extraQuery]
   * @param {boolean} [json] point the URL at `/search.json` instead of `/search`
   * @returns {URL}
   */
  getSearchUrl(extraQuery='', json=false) {
    const url = new URL(location.toString());
    if (json) {
      url.pathname = url.pathname.replace('/search', '/search.json');
    }
    /* `q` is absent on URLs such as /search?title=...; get() then returns null. */
    const rawQuery = url.searchParams.get('q') ?? '';
    const queryPart = rawQuery
      /** This is an OL bug :/ */
      .replace(/author:/g, 'author_name:');
    url.searchParams.set('q', queryPart ? `${queryPart} ${extraQuery}` : extraQuery);
    return url;
  }
}
/**
 * Handler for author pages: the items are the works whose `author_key` is the
 * identifier found in the page path.
 */
class AuthorPageHandler extends AbstractPageHandler {
  /** Applies when the current path begins with `/authors`. */
  test() {
    const { pathname } = location;
    return pathname.startsWith('/authors');
  }

  /** Work count parsed from the "#works h2" heading, ignoring thousands separators. */
  get totalCount() {
    const heading = document.querySelector('#works h2');
    const countText = heading.textContent.trim().replace(/\,/g, '');
    return parseFloat(countText);
  }

  /** Query on the third path segment, i.e. the part after `/authors/`. */
  get curQuery() {
    const [, , authorKey] = location.pathname.split('/');
    return `author_key:${authorKey}`;
  }
}
/**
 * Raised when a list's seeds endpoint cannot be fetched or parsed. Carries the
 * URL that was requested so the UI can link to it.
 */
class ListSeedsError extends Error {
  /** @param {string} url the seeds URL that failed */
  constructor(url) {
    /* Give the error a message and name so logs do not just show a bare "Error". */
    super(`Unable to fetch list seeds from ${url}`);
    this.name = 'ListSeedsError';
    this.url = url;
  }
}
/**
 * Handler for list pages. The list's seeds are fetched from its `seeds.json`
 * endpoint in `init()`; works and editions among them are turned into a
 * search query, other seed types are ignored.
 */
class ListPageHandler extends AbstractPageHandler {
  /** Applies when the current path contains `/lists/`. */
  test() {
    return location.pathname.includes('/lists/');
  }

  /**
   * Fetches and stores the list's seeds.
   *
   * @returns {Promise<void>}
   * @throws {ListSeedsError} when the request or JSON parsing fails; the
   *   underlying error is attached as `cause`.
   */
  async init() {
    const { pathname } = location;
    /*
     * Anchor on the list id (".../lists/OL123L") so that a trailing slash or
     * extra path segments after the id still resolve to the list's seeds URL.
     */
    const listRoot = pathname.match(/^.*?\/lists\/OL\d+L(?=\/|$)/);
    const url = listRoot
      ? `${listRoot[0]}/seeds.json`
      /* No recognisable list id: fall back to swapping the last path segment. */
      : pathname.replace(/\/[^\/]+$/, '/seeds.json');
    try {
      this.seeds = await fetch(url).then(res => res.json());
    } catch (e) {
      const seedsError = new ListSeedsError(url);
      /* Keep the original failure available for debugging. */
      seedsError.cause = e;
      throw seedsError;
    }
  }

  /** The `size` reported by the seeds response. */
  get totalCount() {
    return this.seeds.size;
  }

  /**
   * Query of the form `(field:(a OR b) otherField:(c))`, with edition seeds
   * grouped under `edition_key` and work seeds under `key`.
   */
  get curQuery() {
    /* Group values by search field, preserving first-seen order of the fields. */
    const groups = {};
    for (const seed of this.seeds.entries) {
      let field;
      let value;
      if (seed.type === 'edition') {
        field = 'edition_key';
        value = seed.url.split('/')[2];
      } else if (seed.type === 'work') {
        field = 'key';
        value = seed.url;
      } else {
        continue;
      }
      groups[field] = groups[field] || [];
      groups[field].push(value);
    }
    const clauses = Object.entries(groups).map(([field, values]) => `${field}:(${values.join(' OR ')})`);
    return `(${clauses.join(' ')})`;
  }

  get insertionPoint() {
    /* When on my own lists, want it in .details-content. Public lists should use the default. */
    return document.querySelector('.details-content') || super.insertionPoint;
  }
}
/** Raised when no page handler recognises the current page. */
class UnsupportedPageError extends Error {
  /** @param {string} [message] */
  constructor(message = 'Data Quality Score does not support this page') {
    /* A name and default message make the logged error self-explanatory. */
    super(message);
    this.name = 'UnsupportedPageError';
  }
}
/**
 * Asks openlibrary.org's search endpoint how many records match `query`
 * (rows=0, so no documents are requested).
 *
 * @param {string} query
 * @returns {Promise<number>} the response's `numFound`
 */
async function getResultsCount(query) {
  const params = new URLSearchParams({ q: query, rows: 0 });
  const response = await fetch(`https://openlibrary.org/search.json?${params}`);
  const { numFound } = await response.json();
  return numFound;
}
/* One instance of each page handler, in the order they are tried. */
const handlers = [SearchPageHandler, AuthorPageHandler, ListPageHandler]
  .map(Handler => new Handler());
/**
 * Picks the first handler whose `test()` accepts the current page, removes
 * any widget left from a previous run and mounts a fresh one, stored on
 * `window.DQS_GLOBAL`.
 *
 * @throws {UnsupportedPageError} when no handler accepts the page
 */
async function start() {
  const pageHandler = handlers.find(candidate => candidate.test());
  if (!pageHandler) {
    throw new UnsupportedPageError();
  }

  /* Running the bookmarklet twice should replace the table, not stack a second one. */
  const previous = window.DQS_GLOBAL;
  if (previous) {
    previous.unmount();
  }

  window.DQS_GLOBAL = new DQSChecks({
    checks: CHECKS,
    pageHandler,
  });
}
/*
 * Entry point: start the widget and report any synchronous start-up failure
 * to the user. Unsupported pages get a friendly explanation; anything else is
 * shown as-is.
 */
try {
  await start();
} catch (e) {
  console.error(e);
  if (e instanceof UnsupportedPageError) {
    /* List pages are supported too (see ListPageHandler), so say so. */
    alert('Data Quality Score only works on search, author, or list pages.');
  } else {
    alert(e);
  }
}
}
main();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment