Created
December 8, 2015 13:17
-
-
Save mamuesp/e08a4f9b484ab8a84748 to your computer and use it in GitHub Desktop.
A small Node.js-based tool to extract script tags from an HTML page.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 *
 * Created by M.MUeller-Spaeth on 07.12.15.
 * Copyright 2015 by M.Mueller-Spath, fms1961@gmail.com
 *
 * usage: node extract.js <HTML file> <output path> <attribute name for file names>
 *
 */
var fs = require('fs'); | |
var cheerio = require('cheerio'); | |
// Command-line arguments:
//   node extract.js <HTML file> <output path> <attribute name for file names>
const inputFile = process.argv[2];
const outputDirArg = process.argv[3];
const tagAttr = process.argv[4];

// All three arguments are required. Without this guard a missing output path
// became the literal directory name "undefined/" and a missing input file made
// fs.readFile throw a raw TypeError.
if (!inputFile || !outputDirArg || !tagAttr) {
  console.error("usage: node extract.js <HTML file> <output path> <attribute name for file names>");
  process.exit(1);
}

// Normalise the output path to end in exactly one "/". Only trailing slashes
// are touched; interior parts of the path are left as the user typed them.
const outputPath = outputDirArg.replace(/\/+$/, "") + "/";

// Second argument (doCreate) asks checkPath to create the directory.
checkPath(outputPath, true);

console.log("The file '" + inputFile + "' will be processed.");
console.log("The path '" + outputPath + "' will be the output directory.");

// Read the HTML file and hand its content to extractScripts.
fs.readFile(inputFile, function (err, data) {
  if (err) {
    // Report the failure on stderr and signal it through the exit code
    // instead of dying with an uncaught exception.
    console.error("The file '" + inputFile + "' could not be read: " + err.message);
    process.exitCode = 1;
    return;
  }
  extractScripts(data);
});
/**
 * Checks that `path` is an existing directory, optionally creating it.
 *
 * @param {string} path - Directory path to check.
 * @param {boolean} doCreate - When truthy and `path` does not exist, the
 *   directory (including missing parent directories) is created with mode
 *   0o755 and then checked again.
 * @returns {boolean} `true` if `path` is a directory when the function
 *   returns; `false` if it is missing (and was not created), exists but is
 *   not a directory, or could not be inspected/created.
 */
function checkPath(path, doCreate) {
  try {
    // An existing entry is never replaced: it either is a directory or it isn't.
    return fs.statSync(path).isDirectory();
  } catch (err) {
    // statSync throws for a missing path, so creation has to be handled here.
    // Any other failure (ENOTDIR, EACCES, ...) cannot be fixed by mkdir.
    if (!doCreate || err.code !== 'ENOENT') {
      return false;
    }
  }

  try {
    // `recursive` also creates missing parents (requires Node.js >= 10.12).
    fs.mkdirSync(path, { recursive: true, mode: 0o755 });
  } catch (err) {
    return false;
  }

  // Verify the result without attempting to create it a second time.
  return checkPath(path, false);
}
/**
 * Writes the text content of every <script> element that carries the
 * attribute named by the module-level `tagAttr` to
 * `outputPath + <attribute value> + ".html"`.
 *
 * Script elements without that attribute (or with an empty value) are skipped.
 * Attribute values that would place the file outside the output directory
 * (for example values containing "../") are skipped with a message on stderr,
 * because the value comes straight from the parsed HTML document.
 * Files are written asynchronously; each result is reported on the console.
 *
 * @param {string|Buffer} data - HTML document content passed to cheerio.load.
 */
function extractScripts(data) {
  // Local require: only this function needs the path helpers.
  const path = require('path');
  const $ = cheerio.load(data);
  const outputRoot = path.resolve(outputPath);

  $('script').each(function () {
    const script = $(this);
    const name = script.attr(tagAttr);
    if (!name) {
      return;
    }

    const fileName = outputPath + name + ".html";

    // Refuse targets that resolve to a location outside the output directory.
    const relative = path.relative(outputRoot, path.resolve(fileName));
    const escapesRoot = path.isAbsolute(relative) || relative.split(path.sep)[0] === '..';
    if (escapesRoot) {
      console.error("The file '" + fileName + "' is outside the output directory and was skipped.");
      return;
    }

    fs.writeFile(fileName, script.text(), function (err) {
      if (err) {
        // Failures belong on stderr so they are not mixed with progress output.
        console.error(err);
        return;
      }
      console.log("The file '" + fileName + "' was saved!");
    });
  });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Needed a tool which extracts data held in script tags in an HTML file. The tags had an attribute I could use as the file name, so each tag's content is written to an HTML file in the output path as <output path>/<attribute value>.html