Skip to content

Instantly share code, notes, and snippets.

@saurabhvyas
Created August 23, 2017 12:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save saurabhvyas/1b719f027984ea33864a15fd58bf1b9f to your computer and use it in GitHub Desktop.
Save saurabhvyas/1b719f027984ea33864a15fd58bf1b9f to your computer and use it in GitHub Desktop.
Node.js script that converts every .txt file containing wikitext in one folder into a plain-text .txt file of the same name in a new folder
// Batch-convert raw Wikipedia wikitext into plain text: every .txt file found
// in INPUT_DIR is parsed with wtf_wikipedia and the result is written to a
// file of the same name in OUTPUT_DIR. Entries without a .txt extension are
// reported and skipped.
const wtf_wikipedia = require("wtf_wikipedia");
const fs = require('fs');
const path = require('path');

// Folder holding the raw wikitext dumps that are read as input.
const INPUT_DIR = '/media/saurabh/New Volume/wikipedia_cs_txt/xml/';
// Folder that receives the converted plain-text files.
const OUTPUT_DIR = '/media/saurabh/New Volume/wikipedia_cs_txt/final_txt/';

// writeFileSync does not create missing directories, so make sure the
// destination exists before the first write.
if (!fs.existsSync(OUTPUT_DIR)) {
  fs.mkdirSync(OUTPUT_DIR);
}

fs.readdirSync(INPUT_DIR).forEach(function (file) {
  if (path.extname(file) !== '.txt') {
    console.log('non .txt file');
    return;
  }

  try {
    // With an explicit encoding readFileSync returns a string or throws,
    // so no additional type check on the contents is needed.
    const contents = fs.readFileSync(path.join(INPUT_DIR, file), 'utf8');

    // NOTE(review): `plaintext` is the API this script was written against;
    // confirm it still exists in the installed wtf_wikipedia version.
    const plainText = wtf_wikipedia.plaintext(contents);

    // The synchronous fs API reports failures by throwing; it takes no
    // callback, so errors are handled by the surrounding try/catch.
    fs.writeFileSync(path.join(OUTPUT_DIR, file), plainText);
  } catch (err) {
    // Keep converting the remaining files, but make the failure visible
    // both in the log and in the process exit status.
    console.error('failed to convert ' + file + ': ' + err.message);
    process.exitCode = 1;
  }
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment