Semantic Webpage Parse into a human- and machine-readable structure
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var request = require("request"); | |
const cheerio = require('cheerio'); | |
const extractor = require('unfluff'); | |
module.exports = WebpageSemanticParse = function (url) { | |
return new Promise((resolve, reject) => { | |
request.get(url, function (error, response, body) { | |
let unfluffedData = extractor(body); | |
const url = unfluffedData.canonicalLink; | |
const { hostname } = new URL(url); | |
let headingArr = [] | |
let returnObject = { | |
title: unfluffedData.title, | |
date: unfluffedData.date, | |
website: hostname, | |
author: unfluffedData.author, | |
description: unfluffedData.description, | |
publisher: unfluffedData.publisher, | |
// links: data.links, | |
mainImage: unfluffedData.image, | |
images: [{ | |
src: unfluffedData.image, | |
}], | |
content: unfluffedData.text.split('\n\n'), | |
sections: headingArr, | |
// html:null | |
}//returnObject.content=returnObject.content.split('\n\n') | |
parseHTML(body, unfluffedData.text.split('\n\n'), unfluffedData.title/*returnObject.content*/).then((returnArr) => { | |
//return `<img src=https:${item}>` | |
// returnObject.html=returnArr.html; | |
returnArr.images.map(item => { | |
if (!item && item === undefined) { | |
return | |
} | |
if (!item.includes('//')) { | |
item = '//' + hostname + item | |
} | |
if (item === unfluffedData.image.toLowerCase()) { | |
return | |
} | |
returnObject.images.push({ src: item.includes('http') ? item : `https:${item}` }) | |
}); | |
returnObject.sections = returnArr.heading | |
resolve(returnObject) | |
}) | |
}); | |
}) | |
} | |
/**
 * Turns one element of a section into the text entries recorded for it.
 *
 * @param {Function} $ - cheerio instance the element belongs to.
 * @param {object} element - A node from the section (p, li, ul, ol, img or figure).
 * @param {string[]} unfluffed - Paragraphs of the cleaned article text.
 * @returns {string[]} Zero or more strings to append to the section's `contentCleaned`.
 */
function cleanSectionElement($, element, unfluffed) {
  const rawText = $(element).text();

  // Lists contribute one entry per direct child.
  if (element.tagName === 'ul' || element.tagName === 'ol') {
    const items = [];
    $(element).children().each((_, child) => {
      items.push($(child).text());
    });
    return items;
  }

  // Media elements and short standalone paragraphs are kept verbatim.
  if (element.tagName === 'figure' || element.tagName === 'img' || rawText.split(' ').length < 5) {
    return [rawText];
  }

  const normalized = rawText.replace(/(\r\n|\n|\r|\t)/gm, "").trim();
  // An empty needle would "match" the first paragraph, since every string includes ''.
  if (normalized === '') {
    return [rawText];
  }

  // Prefer the cleaned paragraph that contains this element's text.
  const cleanMatch = unfluffed.find((paragraph) => paragraph.includes(normalized));
  if (cleanMatch) {
    return [cleanMatch];
  }

  // Otherwise, if a cleaned paragraph sits inside this element's text, split around it.
  const containedParagraph = unfluffed.find(
    (paragraph) => paragraph !== '' && rawText.includes(paragraph)
  );
  if (containedParagraph) {
    return [containedParagraph, rawText.replace(containedParagraph, "")];
  }

  return [rawText];
}

/**
 * Extracts image sources and heading-delimited sections from an HTML document.
 *
 * Every h2-h6 heading starts a section made of the following sibling elements
 * (p, li, ul, ol, img, figure) up to the next heading. Sections with no content are
 * dropped. Before the first kept section, the cleaned paragraphs that precede that
 * section's content are pushed as plain strings, so `heading` can hold both strings
 * and section objects.
 *
 * @param {string} body - Raw HTML, loaded with cheerio.
 * @param {string[]} unfluffed - Paragraphs of the cleaned article text.
 * @param {string} title - Unused; kept so existing callers keep working.
 * @returns {Promise<{images: string[], heading: Array<string|object>, Firstcontent: string[]}>}
 *   `images` holds the non-empty `src` attributes found; each section object has the
 *   shape `{ heading, content, contentCleaned, index }`.
 */
async function parseHTML(body, unfluffed, title) {
  const $ = cheerio.load(body);
  const headingSelector = "h2, h3, h4, h5, h6";
  const returnArr = {
    images: [],
    heading: [],
    Firstcontent: [''],
  };

  // All <img> sources first, then any <figure> carrying its own src. Elements without
  // a src are skipped instead of contributing `undefined` entries.
  for (const selector of ["img", "figure"]) {
    $(selector).each((_, element) => {
      const { src } = element.attribs;
      if (src) {
        returnArr.images.push(src);
      }
    });
  }

  let isFirstSection = true;
  $(headingSelector).each((index, element) => {
    const sectionList = $(element)
      .nextUntil(headingSelector)
      .filter('p, li, ul, ol, img, figure');

    const section = {
      heading: $(element).text(),
      content: sectionList.text(),
      contentCleaned: [],
      index,
    };

    sectionList.each((_, sectionElement) => {
      section.contentCleaned.push(...cleanSectionElement($, sectionElement, unfluffed));
    });

    // A heading with nothing usable beneath it is not reported as a section.
    if (section.content === '' || section.contentCleaned.length === 0) {
      return;
    }

    if (isFirstSection) {
      isFirstSection = false;
      // Emit the cleaned paragraphs that come before this section's content; stop at
      // the first paragraph the section already contains.
      for (const paragraph of unfluffed) {
        if (section.contentCleaned.includes(paragraph)) {
          break;
        }
        returnArr.heading.push(paragraph);
      }
    }

    returnArr.heading.push(section);
  });

  return returnArr;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment