Skip to content

Instantly share code, notes, and snippets.

@Scarysize
Last active February 18, 2017 11:47
Show Gist options
  • Save Scarysize/a77684d08c457e603629f62ffbeb11cd to your computer and use it in GitHub Desktop.
Save Scarysize/a77684d08c457e603629f62ffbeb11cd to your computer and use it in GitHub Desktop.
Scrape delicious bookmarks as JSON
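/*
Scrapes every page of a delicious user's bookmark list into a single JSON file.

1. Install jsdom 9.x or older (e.g. `npm install jsdom@9`). This script uses
   the callback-style `jsdom.env()` API, which newer jsdom releases no longer provide.
2. Usage:
   $ node scrape-delicious.js https://del.icio.us/<username>
3. Writes the collected bookmarks to "bookmarks.json" in the current working directory.
*/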
const jsdom = require('jsdom');
const fs = require('fs');
/**
 * Collect the bookmarks listed on one delicious page.
 *
 * Every `.articleThumbBlock` element yields one entry. Pieces of markup that
 * are absent produce neutral values instead of throwing, so a single
 * malformed article cannot abort the scrape of the whole account.
 *
 * @param {Document} document - Parsed DOM of a bookmark listing page.
 * @returns {Array<{date: number, description: string, tags: string[], title: string, url: (string|null)}>}
 *   One entry per article, in document order:
 *   - `date`: `Number()` of the `date` attribute on the article's parent node
 *     (0 when the attribute is missing, because `Number(null)` is 0).
 *   - `title`: text of `.title`, or '' when there is no such element.
 *   - `url`: `href` of the first link inside `.articleInfoPan`, or null.
 *   - `description`: text of the second `<p>` inside `.thumbTBriefTxt`, or ''.
 *   - `tags`: text of every link inside `ul.tagName`, or [].
 */
function extractData(document) {
  return [...document.querySelectorAll('.articleThumbBlock')].map(article => {
    const date = Number(article.parentNode.getAttribute('date'));

    const titleNode = article.querySelector('.title');
    const title = titleNode ? titleNode.textContent : '';

    const infoPan = article.querySelector('.articleInfoPan');
    const link = infoPan ? infoPan.querySelector('a') : null;
    const url = link ? link.getAttribute('href') : null;

    // Description and tags both live inside the same container; when the
    // container itself is missing they keep their "nothing found" defaults.
    let description = '';
    let tags = [];
    const brief = article.querySelector('.thumbTBriefTxt');
    if (brief) {
      const paragraphs = brief.querySelectorAll('p');
      // Only the second paragraph is used as the description.
      if (paragraphs.length > 1) {
        description = paragraphs[1].textContent;
      }

      const tagList = brief.querySelector('ul.tagName');
      if (tagList) {
        tags = [...tagList.querySelectorAll('a')].map(tagLink => tagLink.textContent);
      }
    }

    return {date, description, tags, title, url};
  });
}
/**
 * Load one bookmark listing page with jsdom and extract its bookmarks.
 *
 * @param {string} url - Address of the listing page to load.
 * @returns {Promise<Array<Object>>} Resolves with whatever `extractData`
 *   returns for the loaded document. Rejects with the jsdom load error, or
 *   with any error thrown while extracting.
 */
function scrape(url) {
  return new Promise((resolve, reject) => {
    jsdom.env(url, (err, window) => {
      if (err) {
        reject(err);
        return;
      }

      try {
        resolve(extractData(window.document));
      } catch (extractError) {
        // This callback is invoked by jsdom, not by the promise executor, so
        // an exception here would not reject the promise on its own.
        reject(extractError);
      } finally {
        // The extracted result is already in hand, so the window can be
        // released before moving on.
        window.close();
      }
    });
  });
}
// Entry point: read the profile URL from the command line, find out how many
// listing pages exist, scrape all of them and write the merged result to disk.
const baseUrl = process.argv[2];
if (!baseUrl) {
  throw new Error('no delicious base url passed');
}

/**
 * Determine how many listing pages to scrape from the pagination control.
 *
 * The second-to-last child of `.pagination` is read as the highest page
 * number (presumably the last child is a "next" link; confirm against the
 * real markup). When the control is missing, too short, or its label is not
 * a positive integer, a single page is assumed.
 *
 * @param {Document} document - Parsed DOM of the first listing page.
 * @returns {number} Number of pages to request, always at least 1.
 */
function readLastPage(document) {
  const pagination = document.querySelector('.pagination');
  if (!pagination || pagination.children.length < 2) {
    return 1;
  }

  const label = pagination.children[pagination.children.length - 2].textContent;
  const lastPage = Number(label);
  return Number.isInteger(lastPage) && lastPage >= 1 ? lastPage : 1;
}

jsdom.env(baseUrl, (err, window) => {
  if (err) {
    // Without a window there is nothing to inspect; report and signal failure.
    console.error(err);
    process.exitCode = 1;
    return;
  }

  let lastPage;
  try {
    lastPage = readLastPage(window.document);
  } finally {
    // Only the page count is needed from this window.
    window.close();
  }

  // All pages are requested at once; Promise.all keeps them in page order.
  const scrapes = [];
  for (let page = 1; page <= lastPage; page++) {
    scrapes.push(scrape(`${baseUrl}?&page=${page}`));
  }

  Promise.all(scrapes)
    .then(pages => {
      const flattened = pages.reduce((all, bookmarksFromPage) => all.concat(bookmarksFromPage), []);
      fs.writeFileSync('./bookmarks.json', JSON.stringify(flattened, null, 2));
      console.log('done');
    })
    .catch(scrapeError => {
      console.error(scrapeError);
      // Make the failure visible to shells and scripts invoking this tool.
      process.exitCode = 1;
    });
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment