Created
August 23, 2017 12:03
-
-
Save saurabhvyas/1b719f027984ea33864a15fd58bf1b9f to your computer and use it in GitHub Desktop.
Node.js script that converts each wikitext .txt file in a folder into a plain-text .txt file of the same name in a new folder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Converts every wikitext .txt file in SOURCE_DIR into a file of the same
// name in OUTPUT_DIR, with the text passed through wtf_wikipedia.plaintext().
// Entries whose extension is not .txt are reported on stdout and skipped.
const wtf_wikipedia = require("wtf_wikipedia");
const fs = require('fs');
const path = require('path');

// Directory holding the raw wikipedia article text, one article per .txt file.
const SOURCE_DIR = '/media/saurabh/New Volume/wikipedia_cs_txt/xml/';
// Directory that receives the converted files. This script does not create it.
const OUTPUT_DIR = '/media/saurabh/New Volume/wikipedia_cs_txt/final_txt/';

fs.readdirSync(SOURCE_DIR).forEach(function (file) {
  if (path.extname(file) !== ".txt") {
    console.log('non .txt file');
    return;
  }

  // readFileSync with an encoding returns a string or throws, so no
  // undefined/null check on the result is needed.
  const contents = fs.readFileSync(path.join(SOURCE_DIR, file), 'utf8');

  // NOTE(review): presumably returns the article with wiki markup removed;
  // confirm against the installed wtf_wikipedia version.
  const plainText = wtf_wikipedia.plaintext(contents);

  // The synchronous fs API reports failures by throwing; it never invokes a
  // callback, so none is passed here.
  fs.writeFileSync(path.join(OUTPUT_DIR, file), plainText);
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment