Skip to content

Instantly share code, notes, and snippets.

@adigunturu
Last active September 29, 2022 18:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adigunturu/8c0d05bcf24f9f8fb09a8e1be54dbb01 to your computer and use it in GitHub Desktop.
Save adigunturu/8c0d05bcf24f9f8fb09a8e1be54dbb01 to your computer and use it in GitHub Desktop.
Semantic webpage parser: converts a page into a human- and machine-readable structure
var request = require("request");
const cheerio = require('cheerio');
const extractor = require('unfluff');
module.exports = WebpageSemanticParse = function (url) {
return new Promise((resolve, reject) => {
request.get(url, function (error, response, body) {
let unfluffedData = extractor(body);
const url = unfluffedData.canonicalLink;
const { hostname } = new URL(url);
let headingArr = []
let returnObject = {
title: unfluffedData.title,
date: unfluffedData.date,
website: hostname,
author: unfluffedData.author,
description: unfluffedData.description,
publisher: unfluffedData.publisher,
// links: data.links,
mainImage: unfluffedData.image,
images: [{
src: unfluffedData.image,
}],
content: unfluffedData.text.split('\n\n'),
sections: headingArr,
// html:null
}//returnObject.content=returnObject.content.split('\n\n')
parseHTML(body, unfluffedData.text.split('\n\n'), unfluffedData.title/*returnObject.content*/).then((returnArr) => {
//return `<img src=https:${item}>`
// returnObject.html=returnArr.html;
returnArr.images.map(item => {
if (!item && item === undefined) {
return
}
if (!item.includes('//')) {
item = '//' + hostname + item
}
if (item === unfluffedData.image.toLowerCase()) {
return
}
returnObject.images.push({ src: item.includes('http') ? item : `https:${item}` })
});
returnObject.sections = returnArr.heading
resolve(returnObject)
})
});
})
}
/**
 * Extracts image sources and heading-delimited sections from raw HTML.
 *
 * Every h2-h6 heading starts a section made of the sibling elements that
 * follow it, up to the next h2-h6 heading. Each paragraph in a section is
 * replaced by its matching entry from `unfluffed` when one exists, so the
 * cleaned text is preferred over the raw DOM text.
 *
 * @param {string} body - Raw HTML of the page.
 * @param {string[]} unfluffed - Cleaned article text, one paragraph per entry.
 * @param {string} title - Page title (currently unused; kept for callers).
 * @returns {Promise<{images: string[], heading: Array<string|Object>, Firstcontent: string[]}>}
 *   `images` holds the `src` attributes found. `heading` holds the paragraphs
 *   of `unfluffed` that precede the first non-empty section (as plain strings),
 *   followed by section objects `{ heading, content, contentCleaned, index }`.
 */
async function parseHTML(body, unfluffed, title) {
    const $ = cheerio.load(body);
    const returnArr = {
        images: [],
        heading: [],
        Firstcontent: [''],
    };

    // Two separate passes keep the original ordering (all <img> first, then
    // <figure>). Elements without a src attribute are skipped instead of
    // pushing `undefined` into the result.
    $('img').each((_, element) => {
        const { src } = element.attribs;
        if (typeof src === 'string') {
            returnArr.images.push(src);
        }
    });
    $('figure').each((_, element) => {
        const { src } = element.attribs;
        if (typeof src === 'string') {
            returnArr.images.push(src);
        }
    });

    // Set once the text preceding the first non-empty section has been captured.
    let preambleCaptured = false;

    $('h2, h3, h4, h5, h6').each((index, headingElement) => {
        const section = { heading: $(headingElement).text(), content: '', contentCleaned: [], index };
        const sectionList = $(headingElement)
            .nextUntil('h2, h3, h4, h5, h6')
            .filter('p, li, ul, ol, img, figure');

        sectionList.each((_, element) => {
            const rawText = $(element).text();

            if (element.tagName === 'ul' || element.tagName === 'ol') {
                // Lists contribute one entry per direct child.
                $(element).children().each((__, child) => {
                    section.contentCleaned.push($(child).text());
                });
                return;
            }
            if (element.tagName === 'figure' || element.tagName === 'img') {
                section.contentCleaned.push(rawText);
                return;
            }
            if (rawText.split(' ').length < 5) {
                // Short standalone paragraph: keep the raw text as-is.
                section.contentCleaned.push(rawText);
                return;
            }

            const text = rawText.replace(/(\r\n|\n|\r|\t)/gm, '').trim();
            // An empty needle is contained in every string, so a whitespace-only
            // paragraph must not be matched against the cleaned text.
            const cleanMatch = text === '' ? undefined : unfluffed.find((paragraph) => paragraph.includes(text));
            if (cleanMatch !== undefined) {
                section.contentCleaned.push(cleanMatch);
                return;
            }

            // The element may hold several cleaned paragraphs run together: split
            // off the first one that is recognised and keep the remainder.
            const firstPara = unfluffed.find((paragraph) => paragraph !== '' && rawText.includes(paragraph));
            if (firstPara) {
                section.contentCleaned.push(firstPara);
                section.contentCleaned.push(rawText.replace(firstPara, ''));
                return;
            }
            section.contentCleaned.push(rawText);
        });

        section.content = sectionList.text();
        // Headings with nothing usable underneath are dropped.
        if (section.content === '' || section.contentCleaned.length === 0) {
            return;
        }

        if (!preambleCaptured) {
            preambleCaptured = true;
            // Cleaned paragraphs that come before the first section's content are
            // emitted as plain strings ahead of the section objects.
            const firstSectionTexts = new Set(section.contentCleaned);
            for (const paragraph of unfluffed) {
                if (firstSectionTexts.has(paragraph)) {
                    break;
                }
                returnArr.heading.push(paragraph);
            }
        }
        returnArr.heading.push(section);
    });

    return returnArr;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment