Skip to content

Instantly share code, notes, and snippets.

@Andrews54757
Created January 22, 2018 20:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Andrews54757/fef89705a1fafdc48c85a21823c30987 to your computer and use it in GitHub Desktop.
Save Andrews54757/fef89705a1fafdc48c85a21823c30987 to your computer and use it in GitHub Desktop.
Get text from website preserving position
/**
 * Reports whether an element is removed from rendering via `display: none`.
 *
 * Only the computed `display` property is inspected; other ways of hiding
 * content are not considered here.
 *
 * @param {Element} el - Element whose computed style is queried.
 * @returns {boolean} `true` when the computed `display` value is `'none'`.
 */
function isHidden(el) {
  const { display } = window.getComputedStyle(el);
  return display === 'none';
}
/**
 * Sums an element's offset by walking up its `offsetParent` chain.
 *
 * At every step the element's own scroll position is subtracted from its
 * offset. The walk stops at the first missing element or at the first
 * element whose `offsetLeft` / `offsetTop` is not numeric.
 *
 * @param {HTMLElement|null} el - Element to start from.
 * @returns {{top: number, left: number}} Accumulated offsets.
 */
function getOffset(el) {
  const total = { top: 0, left: 0 };

  // Global isNaN (not Number.isNaN) is intentional: it also stops the walk
  // on objects that have no offset properties at all (undefined -> NaN).
  for (
    let current = el;
    current && !isNaN(current.offsetLeft) && !isNaN(current.offsetTop);
    current = current.offsetParent
  ) {
    total.left += current.offsetLeft - current.scrollLeft;
    total.top += current.offsetTop - current.scrollTop;
  }

  return total;
}
/**
 * Recursively collects the visible text below `node` into a sparse 2-D grid
 * keyed by coarse position.
 *
 * Each non-empty text node is whitespace-normalised and appended to the cell
 * `grid[row][col]`, where row/col are the offsets of the text's parent
 * element (as reported by `getOffset`) divided by 10 and floored.
 *
 * Elements tagged `script`, `link`, `img` or `style` are skipped, as are
 * elements for which `isHidden` returns true (together with their subtrees).
 *
 * @param {Node} node - Node whose `childNodes` are traversed.
 * @param {Array<Array<Array<string>>>} [arr] - Grid to append to; a new one
 *   is created when omitted. The same array is returned.
 * @returns {Array<Array<Array<string>>>} Sparse grid of text fragments.
 */
function buildFromNode(node, arr) {
  const grid = arr || [];
  const CELL_SIZE = 10;

  // Negative or NaN offsets would otherwise become non-index properties
  // (e.g. grid[-4]) that array iteration never visits, silently dropping the
  // text. Such values are clamped into the first cell instead.
  const toCellIndex = (offset) => {
    const cell = Math.floor(offset / CELL_SIZE);
    return cell > 0 ? cell : 0;
  };

  // Every text child of `node` lands in the same cell, so the cell is
  // resolved once, and only if the node actually owns some text.
  let bucket = null;

  for (let i = 0; i < node.childNodes.length; i++) {
    const child = node.childNodes[i];

    if (child.nodeType === 1) {
      const tag = child.tagName ? child.tagName.toLowerCase() : '';
      const isSkippedTag =
        tag === 'script' || tag === 'link' || tag === 'img' || tag === 'style';
      if (isSkippedTag || isHidden(child)) continue;
      buildFromNode(child, grid);
    } else if (child.nodeType === 3) {
      const text = child.nodeValue.trim().replace(/\s+/g, ' ');
      if (!text) continue;

      if (bucket === null) {
        const offset = getOffset(node);
        const row = toCellIndex(offset.top);
        const col = toCellIndex(offset.left);
        if (!grid[row]) grid[row] = [];
        if (!grid[row][col]) grid[row][col] = [];
        bucket = grid[row][col];
      }
      bucket.push(text);
    }
  }

  return grid;
}
/**
 * Extracts the text below `node` as a list of rows, each row being a list of
 * words.
 *
 * The positional grid produced by `buildFromNode` is walked in index order:
 * every non-empty cell is joined with spaces, the cells of a row are joined
 * with spaces, and the resulting line is split into words. Rows that end up
 * without any non-empty cell are omitted.
 *
 * @param {Node} node - Root node to extract text from.
 * @returns {Array<Array<string>>} Rows of words, top to bottom.
 */
function getTextFromNode(node) {
  const grid = buildFromNode(node);
  const rows = [];

  // forEach skips the holes of the sparse grid, just like map/filter would.
  grid.forEach((row) => {
    const cells = [];
    row.forEach((cell) => {
      if (cell && cell.length > 0) {
        cells.push(cell.join(' '));
      }
    });

    if (cells.length > 0) {
      rows.push(cells.join(' ').split(/[  ]/));
    }
  });

  return rows;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment