Created
November 25, 2018 17:59
-
-
Save imesut/d6d44db101b4a434064d88cfebe647eb to your computer and use it in GitHub Desktop.
Get text in selected area by pdf.js with text layer rendering
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This method works only when the PDF is rendered with a text layer.
// It compares the coordinates of each text/word element with the
// rectangle's coordinates. If an element's coordinates lie inside
// the rectangle, its text is collected; all matching pieces are concatenated in order.
// See the article below for the text layer rendering option.
// https://www.sitepoint.com/custom-pdf-rendering/
/**
 * Concatenate the text of every text-layer element on a page whose inline
 * `left`/`top` position lies strictly inside the given rectangle.
 *
 * The page container is looked up with the selector "#page-<pageNumber> > div"
 * and the first match is used. Adjust the selector if your renderer names its
 * page containers differently.
 *
 * @param {number|string} pageNumber - Page identifier used to build the selector.
 * @param {number} Xi - Left edge of the rectangle (exclusive).
 * @param {number} Yi - Top edge of the rectangle (exclusive).
 * @param {number} Xl - Right edge of the rectangle (exclusive).
 * @param {number} Yl - Bottom edge of the rectangle (exclusive).
 * @returns {string} Text of the matching elements in child order, or "" when
 *   the page container is not found or nothing matches.
 */
function getTextInRect(pageNumber, Xi, Yi, Xl, Yl) {
  const page = $("#page-" + pageNumber + " > div")[0];
  if (!page) {
    // Page container not found: nothing to collect.
    return "";
  }

  let textInRect = "";
  for (const textPart of page.childNodes) {
    // childNodes may include nodes without a style object (e.g. text nodes);
    // elements without an inline left/top have no position to compare.
    const style = textPart.style;
    if (!style || !style.left || !style.top) {
      continue;
    }

    // parseFloat reads the numeric prefix and ignores the unit suffix.
    // NOTE(review): assumes left/top are in px, in the same coordinate space
    // as the rectangle - confirm against the renderer in use.
    const textPartX = parseFloat(style.left);
    const textPartY = parseFloat(style.top);

    if (textPartX > Xi && textPartX < Xl && textPartY > Yi && textPartY < Yl) {
      textInRect += textPart.textContent;
    }
  }

  return textInRect;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment