Skip to content

Instantly share code, notes, and snippets.

@illarionvk
Created January 13, 2014 09:04
Show Gist options
  • Save illarionvk/8396889 to your computer and use it in GitHub Desktop.
Save illarionvk/8396889 to your computer and use it in GitHub Desktop.
Find elements in 50 HTML files, extract data and create Jekyll post files with YAML metadata using Node.js
// 1. Get list of files
// 2. For each file:
// 2.1 Read a file
// 2.2 Find required data
// 2.3 Put the information in a JSON object
// 3. Convert JSON object to YAML
// 4. Write new Markdown file in _posts folder
var fs = require('fs')
/**
 * Lists the current working directory and reports every entry whose name
 * ends in ".html".
 *
 * @param {function(null, string)} callback - invoked as
 *   callback(null, fileName) once per matching entry, in the order
 *   fs.readdir returns them.
 * @throws rethrows the fs.readdir error from inside the readdir callback.
 */
function getFileList(callback) {
  var hasHTMLextension = /\.html$/;
  fs.readdir('.', function (err, files) {
    if (err) {
      throw err;
    }
    // forEach visits only real array elements; for...in would also walk any
    // enumerable property that some library added to Array.prototype.
    files.forEach(function (fileName) {
      if (hasHTMLextension.test(fileName)) {
        callback(null, fileName);
      }
    });
  });
}
/**
 * Error-first callback used with getFileList: reads one file as UTF-8 and
 * passes its contents to parseData.
 *
 * @param {?Error} err - error reported by the caller; when set, it is logged
 *   and nothing is read.
 * @param {string} fileName - path of the HTML file to read.
 */
function readFile(err, fileName) {
  // Honour the error-first contract instead of attempting to read an
  // undefined file name.
  if (err) {
    return console.log(err);
  }
  fs.readFile(fileName, 'utf8', function (readErr, data) {
    // A failed read is logged and skipped so the remaining files still run.
    if (readErr) {
      return console.log(readErr);
    }
    console.log('File ' + fileName + ' successfully read');
    parseData(data, fileName);
  });
}
/**
 * Builds a Jekyll post (YAML front matter followed by the description text)
 * from one member page and hands it to writeNewFile.
 *
 * Every field is optional: an element that is missing from the page is
 * skipped instead of raising a TypeError, and a page without a description
 * paragraph gets an empty post body.
 *
 * @param {string} data - HTML source of the page.
 * @param {string} fileName - name of the source file, forwarded to
 *   writeNewFile.
 */
function parseData(data, fileName) {
  var jsdom = require("jsdom");

  // Front-matter key -> position of the <span> inside the "info" block that
  // holds its value. Only odd positions are read; presumably the even ones
  // are the visible labels -- confirm against the source pages.
  var INFO_FIELDS = [
    { key: 'original_timestamp', index: 1 },
    { key: 'contact', index: 3 },
    { key: 'telephone', index: 5 },
    { key: 'email', index: 7 },
    { key: 'branch', index: 9 }
  ];

  jsdom.env(
    data,
    // NOTE(review): jQuery is injected into the window but nothing below
    // uses it; consider dropping this network fetch.
    ["http://code.jquery.com/jquery.js"],
    function (errors, window) {
      var YAML = require('json2yaml');
      var membersJSON = {};
      var description = '';
      var doc, yml;
      var titleDiv, descriptionDiv, heading, paragraphs, memberLink;
      var infoDiv, infoSpans;
      var logoDiv, logoImg;
      var embedDiv, iframe;

      // Without a window there is nothing to query; report and skip the file.
      if (!window) {
        return console.log(errors);
      }
      doc = window.document;

      membersJSON.layout = 'members';
      membersJSON.published = true;

      titleDiv = doc.getElementsByClassName('title')[0];
      if (titleDiv) {
        membersJSON.title = titleDiv.textContent;
      }

      // Description block: <h3> is the heading, the first <p> becomes the
      // post body and the second <p> may carry the member's link.
      descriptionDiv = doc.getElementsByClassName('description')[0];
      if (descriptionDiv) {
        heading = descriptionDiv.getElementsByTagName('h3')[0];
        if (heading) {
          membersJSON.description_title = heading.textContent;
        }
        paragraphs = descriptionDiv.getElementsByTagName('p');
        if (paragraphs[0]) {
          description = paragraphs[0].textContent;
        }
        if (paragraphs[1]) {
          memberLink = paragraphs[1].getElementsByTagName('a')[0];
          membersJSON.member_url = memberLink ? memberLink.href : '';
        }
      }

      // Info block
      infoDiv = doc.getElementsByClassName('info')[0];
      if (infoDiv) {
        infoSpans = infoDiv.getElementsByTagName('span');
        INFO_FIELDS.forEach(function (field) {
          var span = infoSpans[field.index];
          if (span) {
            membersJSON[field.key] = span.textContent;
          }
        });
      }

      // Member logo: keep only the part of the path below the images folder.
      logoDiv = doc.getElementsByClassName('member-logo')[0];
      if (logoDiv) {
        logoImg = logoDiv.getElementsByTagName('img')[0];
        if (logoImg) {
          membersJSON.member_logo = logoImg.src.replace(/^file:\/\/\/Users\/hex\/Dropbox\/Sites\/study\/images\//i, '');
        }
      }

      // YouTube URL
      embedDiv = doc.getElementsByClassName('embed-container')[0];
      if (embedDiv) {
        iframe = embedDiv.getElementsByTagName('iframe')[0];
        if (iframe) {
          membersJSON.youtube_url = iframe.src;
        }
      }

      // Add a line break after the opening "---", then close the front
      // matter and append the description as the post body.
      yml = YAML.stringify(membersJSON).replace(/^---/, "---\n");
      yml = yml + "\n---" + "\n\n" + description + "\n";
      console.log(yml);
      writeNewFile(fileName, yml);
    }
  );
}
/**
 * Saves the generated post text under ./_posts, swapping the source file's
 * ".html" extension for ".markdown" and prefixing the fixed post date.
 *
 * @param {string} fileName - name of the original HTML file.
 * @param {string} yml - complete post text to write.
 * @throws rethrows the fs.writeFile error from inside the write callback.
 */
function writeNewFile(fileName, yml) {
  var target = './_posts/2014-01-09-' + fileName.replace(/\.html$/i, ".markdown");

  function onWritten(writeError) {
    if (writeError) {
      throw writeError;
    }
    console.log(fileName + ' is saved!');
  }

  fs.writeFile(target, yml, onWritten);
}
// Entry point: readFile is invoked once for every .html file found in the current directory.
getFileList(readFile);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment