Last active
January 28, 2017 10:41
-
-
Save gahabeen/990230bf32094ebbb8da52ce5e92be57 to your computer and use it in GitHub Desktop.
jsonframe data extraction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Example: extracting a structured list of companies from an HTML page with
// cheerio and the jsonframe-cheerio plugin. A "frame" is a plain object that
// mirrors the desired output shape and maps each output key to a selector.
const cheerio = require('cheerio');
const jsonframe = require('jsonframe-cheerio');

// NOTE: cheerio.load() parses an HTML *string*; it does not fetch URLs.
// Replace the placeholder below with the markup of the page to scrape
// (downloaded beforehand with the HTTP client of your choice).
const $ = cheerio.load('our html page url here');

jsonframe($); // initializes the plugin (adds .scrape() to cheerio selections)

const frame = {
  "companies": { // parent key of the output: "companies"
    "selector": ".item", // elements to search for, one per company
    "data": [{ // "data": [{}] declares a list of items
      "name": ".header [itemprop=name]", // inline selector -> item "name"
      "description": ".header [rel=description]", // inline selector -> item "description"
      "url": { // object form: "selector" plus "attr" to read an attribute
        "selector": ".header [itemprop=name]", // same element as "name" above
        "attr": "href" // name of the attribute to retrieve
      },
      "contact": { // nested object -> item "contact"
        "selector": ".contact", // element the nested selectors are scoped to
        "data": { // "data": {} declares a single nested object
          "telephone": { // "type" selects a plugin parser for the matched text
            "selector": "[itemprop=telephone]", // simple selector for "telephone"
            "type": "telephone" // "telephone" parser: keep only the phone number
          },
          "employee": { // grouping node -> item "contact"."employee"
            "name": "[itemprop=employeeName]", // inline selector -> "name"
            "jobTitle": "[itemprop=employeeJobTitle]", // inline selector -> "jobTitle"
            "email": { // "type" selects a plugin parser for the matched text
              "selector": "[itemprop=email]", // simple selector for "email"
              "type": "email" // "email" parser: keep only the email address
            }
          }
        }
      }
    }]
  }
};

// Apply the frame to the container holding the company items.
const companiesList = $('.list.items').scrape(frame);
console.log(companiesList); // Output the data in the terminal
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment