Last active
February 18, 2017 11:47
-
-
Save Scarysize/a77684d08c457e603629f62ffbeb11cd to your computer and use it in GitHub Desktop.
Scrape delicious bookmarks as JSON
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 1. Install jsdom.
 2. Usage:
    $ node scrape-delicious.js https://del.icio.us/<username>
 3. Writes the scraped data to "bookmarks.json".
*/
// External DOM implementation used to load and parse the remote pages.
const jsdom = require('jsdom');
// Node standard library, used to persist the scraped bookmarks.
const fs = require('fs');
/**
 * Pull bookmark entries out of a single delicious listing page.
 *
 * @param {Document} document - DOM document of one del.icio.us page.
 * @returns {Array<{date: number, description: string, tags: string[], title: string, url: string}>}
 *   One plain record per `.articleThumbBlock` element on the page.
 */
function extractData(document) {
  const blocks = document.querySelectorAll('.articleThumbBlock ');
  return Array.from(blocks, (block) => {
    const brief = block.querySelector('.thumbTBriefTxt');
    const paragraphs = brief.querySelectorAll('p');
    // The tag list is optional; untagged bookmarks simply have no ul.tagName.
    const tagContainer = brief.querySelector('ul.tagName');
    const tagAnchors = tagContainer
      ? [...tagContainer.querySelectorAll('a')]
      : [];
    return {
      // The timestamp lives on the block's parent as a `date` attribute.
      date: Number(block.parentNode.getAttribute('date')),
      // The second paragraph (when present) carries the description text.
      description: paragraphs.length > 1 ? paragraphs[1].textContent : '',
      tags: tagAnchors.map((anchor) => anchor.textContent),
      title: block.querySelector('.title').textContent,
      url: block
        .querySelector('.articleInfoPan')
        .querySelector('a')
        .getAttribute('href'),
    };
  });
}
/**
 * Load `url` with jsdom and extract its bookmark entries.
 * Adapts the callback-style `jsdom.env` API to a Promise.
 *
 * @param {string} url - Page URL to fetch and parse.
 * @returns {Promise<Array<object>>} Resolves with the entries from
 *   extractData; rejects with the jsdom error on failure.
 */
function scrape(url) {
  return new Promise((resolve, reject) => {
    jsdom.env(url, (err, window) => {
      if (err) {
        return reject(err);
      }
      resolve(extractData(window.document));
    });
  });
}
const baseUrl = process.argv[2];
if (!baseUrl) {
  throw new Error('no delicious base url passed');
}

// Load the first page to discover the page count, then scrape every page
// in parallel and write the combined bookmarks to disk.
jsdom.env(baseUrl, (err, window) => {
  if (err) {
    // FIX: the original ignored `err`, so a failed initial request crashed
    // on `window.document` instead of reporting the underlying error.
    console.error(err);
    return;
  }
  const document = window.document;
  const pagination = document.querySelector('.pagination');
  if (!pagination) {
    console.error(new Error('no .pagination element found on ' + baseUrl));
    return;
  }
  // The last pagination child is the "next" arrow; the highest page number
  // sits immediately before it.
  const lastPage = Number(
    pagination.children[pagination.children.length - 2].textContent
  );
  const scrapes = [];
  for (let page = 1; page <= lastPage; page++) {
    // FIX: the original built `?&page=N` (stray `&` after `?`), which is a
    // malformed query string.
    scrapes.push(scrape(`${baseUrl}?page=${page}`));
  }
  Promise.all(scrapes)
    .then((perPage) => {
      // Flatten the per-page arrays into one list of bookmark records.
      const flattened = perPage.reduce((acc, bookmarksFromPage) => {
        acc.push(...bookmarksFromPage);
        return acc;
      }, []);
      fs.writeFileSync('./bookmarks.json', JSON.stringify(flattened, null, 2));
      console.log('done');
    })
    .catch((scrapeError) => {
      console.error(scrapeError);
    });
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment