Last active
June 4, 2018 22:47
-
-
Save Hugo-ter-Doest/2bf6d1619ec378d473607dba14b47c04 to your computer and use it in GitHub Desktop.
Process the French Treebank to create part-of-speech (POS) tagged sentences
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var fs = require("fs");
var path = require("path");
var xml2js = require('xml2js');
var Sentence = require('./Sentence');
// Folder with the corpus files to process. The trailing slash is kept so that
// appending a bare file name to this string yields a valid path.
var corpusPath = "/home/hugo/Workspace/FrenchTreeBank/originXML/current/corpus-constit/";
// Documentation of the treebank:
// http://ftb.linguist.univ-paris-diderot.fr/treebank.php?fichier=documentation&langue=en
//
// Part-of-speech tags:
//   A     (adjective)
//   Adv   (adverb)
//   C     (conjunction): coordinating (Cc) or subordinating (Cs)
//   Cl    (weak clitic pronoun)
//   D     (determiner)
//   ET    (foreign word)
//   I     (interjection)
//   N     (noun): common noun (Nc) or proper noun (Np)
//   P     (preposition)
//   PREF  (prefix)
//   PRO   (strong pronoun)
//   V     (verb)
//   PONCT (punctuation mark)
var partOfSpeechTags = [
  "A",
  "Adv",
  "C",
  "Cc",
  "Cs",
  "Cl",
  "D",
  "ET",
  "I",
  "N",
  "Nc",
  "Np",
  "P",
  "PREF",
  "PRO",
  "V",
  "PONCT"
];
// Phrasal tags:
//   AP     (adjectival phrases)
//   AdP    (adverbial phrases)
//   COORD  (coordinated phrases)
//   NP     (noun phrases)
//   PP     (prepositional phrases)
//   VN     (verbal nucleus)
//   VPinf  (infinitive clauses)
//   VPpart (nonfinite clauses)
//   SENT   (sentences)
//   Sint, Srel, Ssub (finite clauses)
var phrasalTagSet = [
  "AP",
  "AdP",
  "COORD",
  "NP",
  "PP",
  "VN",
  "VPinf",
  "VPpart",
  "SENT",
  "Sint",
  "Srel",
  "Ssub"
];
/**
 * Walks a parsed sentence (sub)tree depth first and hands every word found
 * under a "w" key, together with its "cat" attribute, to sentenceObject.
 *
 * The node layout is the one produced by the xml2js parser used in this file:
 * child elements are arrays keyed by tag name, attributes live under "$" and
 * character data under "_".
 *
 * @param {Object} s - Node of the parsed tree (a sentence or any descendant).
 * @param {Object} sentenceObject - Collector exposing addTaggedWord(word, tag).
 */
function processSentence(s, sentenceObject) {
  Object.keys(s).forEach(tag => {
    const children = s[tag];

    // Only arrays are child elements. Attributes ("$"), character data ("_")
    // and the characters of a text-only node are not, and calling forEach on
    // them would throw.
    if (tag === "$" || !Array.isArray(children)) {
      return;
    }

    if (tag === "w") {
      // Add (word, tag) to the sentence.
      // NOTE(review): a "w" entry without its own text (e.g. one that only
      // wraps nested "w" elements) is recorded with an undefined word —
      // confirm whether such entries need special handling.
      children.forEach(word => {
        sentenceObject.addTaggedWord(word["_"], word["$"]["cat"]);
      });
    } else {
      // Go deeper.
      children.forEach(child => {
        processSentence(child, sentenceObject);
      });
    }
  });
}
/**
 * Turns every sentence of one parsed document into a Sentence and prints it
 * to stdout as pretty-printed JSON (two-space indent).
 *
 * @param {Object} object - Parse result; sentences are read from
 *   object.text.SENT. A missing object, "text" or "SENT" means there is
 *   nothing to process.
 */
function processObject(object) {
  const text = object && object["text"];
  const sentences = text && text["SENT"];
  if (!sentences) {
    return;
  }

  sentences.forEach(sentence => {
    // A fresh collector per sentence; nothing is shared between iterations.
    const sentenceObject = new Sentence();
    processSentence(sentence, sentenceObject);
    console.log(JSON.stringify(sentenceObject, null, 2));
  });
}
/**
 * Reads one file as UTF-8, parses it with xml2js and passes the parse result
 * to processObject.
 *
 * A file that cannot be parsed is reported on stderr and skipped, so that one
 * malformed file does not surface as an unrelated error further down.
 *
 * @param {string} file - Path of the file to read.
 */
function processFile(file) {
  const text = fs.readFileSync(file, 'utf8');
  const parser = new xml2js.Parser();
  parser.parseString(text, function (err, result) {
    if (err) {
      // The result is not usable when the parser reports an error.
      console.error(`Skipping ${file}: XML parse error: ${err.message || err}`);
      return;
    }
    processObject(result);
  });
}
/**
 * Runs processFile on every regular file located directly inside folder.
 *
 * @param {string} folder - Corpus directory; a trailing path separator is
 *   optional because the file paths are built with path.join.
 */
function processCorpus(folder) {
  fs.readdirSync(folder).forEach(file => {
    const filePath = path.join(folder, file);
    // readdirSync also lists subdirectories, which cannot be read as files.
    if (!fs.statSync(filePath).isFile()) {
      return;
    }
    processFile(filePath);
  });
}
// Entry point: process the whole corpus folder when the script is run.
processCorpus(corpusPath);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment