Skip to content

Instantly share code, notes, and snippets.

@mukulmishra18
Created April 17, 2017 15:33
Show Gist options
  • Save mukulmishra18/55409644ff6577b7b8c0a01cfdc9e762 to your computer and use it in GitHub Desktop.
Save mukulmishra18/55409644ff6577b7b8c0a01cfdc9e762 to your computer and use it in GitHub Desktop.
getTextContent function of PDF.js with Streams API.
getTextContent: function PDFPageProxy_getTextContent(params) {
var readableStream = this.transport.messageHandler.sendWithStream('GetTextContent', {
pageIndex: this.pageNumber - 1,
normalizeWhitespace: (params && params.normalizeWhitespace === true ?
true : /* Default */ false),
combineTextItems: (params && params.disableCombineTextItems === true ?
false : /* Default */ true),
});
return new Promise(function (resolve, reject) {
readAllChunks(readableStream);
function readAllChunks (readableStream) {
var reader = readableStream.getReader();
var textContent = {
items: [],
styles: {}
};
pump();
function pump () {
reader.read().then(function (result) {
if (result.done) {
resolve(textContent);
}
if (typeof result.value[0] == 'string') {
textContent.styles[result.value[0]] = result.value[1];
} else {
textContent.items.push(result.value[0]);
if (result.value[1]) {
textContent.items.push(result.value[1]);
}
}
pump();
}, function (error) {
reject(error);
});
}
}
});
}
/**
 * Handler for 'GetTextContent' requests: extracts the text content of the
 * requested page and hands it to `sink` chunk by chunk whenever the sink
 * pulls.
 *
 * Chunk order: one `[styleKey, style]` array per entry of
 * `textContent.styles` first, then arrays of consecutive entries of
 * `textContent.items`; when nothing is left the sink is closed.
 *
 * NOTE(review): `sink` is assumed to expose `onPull`, `enqueue` and `close`
 * as used below; its full contract is not visible here - confirm against the
 * stream implementation of the message handler.
 *
 * @param {Object} data - Request payload; reads `pageIndex`,
 *   `normalizeWhitespace` and `combineTextItems`.
 * @param {Object} sink - Stream sink the chunks are enqueued into.
 * @returns {Promise} Resolves once extraction finished and chunks can be
 *   served; rejects when fetching the page or extracting its text fails.
 *   Stays pending when extraction fails on a terminated task.
 */
handler.on('GetTextContent', function wphExtractText(data, sink) {
  var pageIndex = data.pageIndex;
  var normalizeWhitespace = data.normalizeWhitespace;
  var combineTextItems = data.combineTextItems;

  // Set once extraction is done; returns the next chunk or null when drained.
  var getChunk = null;
  // A pull that arrived before extraction finished is remembered and served
  // later instead of calling the not-yet-defined getChunk.
  var hasPendingPull = false;
  var pendingDesiredSize;

  function servePull(desiredSize) {
    var chunk = getChunk(desiredSize);
    if (chunk === null) {
      sink.close();
      return;
    }
    sink.enqueue(chunk);
  }

  sink.onPull = function (desiredSize) {
    if (getChunk === null) {
      hasPendingPull = true;
      pendingDesiredSize = desiredSize;
      return;
    }
    servePull(desiredSize);
  };

  return new Promise(function (resolve, reject) {
    pdfManager.getPage(pageIndex).then(function (page) {
      var task = new WorkerTask('GetTextContent: page ' + pageIndex);
      startWorkerTask(task);
      var pageNum = pageIndex + 1;
      var start = Date.now();

      return page.extractTextContent(handler, task, normalizeWhitespace,
                                     combineTextItems).then(
        function (textContent) {
          finishWorkerTask(task);
          info('text indexing: page=' + pageNum + ' - time=' +
               (Date.now() - start) + 'ms');

          var items = textContent.items;
          var styles = textContent.styles;
          var stylesKeys = Object.keys(styles);
          // Cursors instead of splice(): no repeated front removal and the
          // extracted arrays are left untouched.
          var nextStyle = 0;
          var nextItem = 0;

          getChunk = function (desiredSize) {
            if (nextStyle < stylesKeys.length) {
              var key = stylesKeys[nextStyle++];
              return [key, styles[key]];
            }
            if (nextItem < items.length) {
              // Always hand out at least one item so that a missing or
              // non-positive desiredSize cannot produce empty chunks forever.
              var count = desiredSize >= 1 ? Math.floor(desiredSize) : 1;
              var chunk = items.slice(nextItem, nextItem + count);
              nextItem += chunk.length;
              return chunk;
            }
            return null;
          };

          if (hasPendingPull) {
            hasPendingPull = false;
            servePull(pendingDesiredSize);
          }
          resolve();
        }, function (reason) {
          finishWorkerTask(task);
          if (task.terminated) {
            return; // ignoring errors from the terminated thread
          }
          reject(reason);
        });
    }).catch(reject); // getPage failures and errors thrown in the handlers.
  });
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment