Skip to content

Instantly share code, notes, and snippets.

@Hugo-ter-Doest
Last active June 4, 2018 22:47
Show Gist options
  • Save Hugo-ter-Doest/2bf6d1619ec378d473607dba14b47c04 to your computer and use it in GitHub Desktop.
Save Hugo-ter-Doest/2bf6d1619ec378d473607dba14b47c04 to your computer and use it in GitHub Desktop.
Process the French Treebank to create POS-tagged sentences
var fs = require("fs");
var path = require("path");
xml2js = require('xml2js');
var Sentence = require('./Sentence');
// Folder with the corpus files. The first command-line argument overrides the
// default location; without an argument the original hard-coded folder is used.
// A trailing "/" is ensured so the value can be prefixed directly to a file name.
var corpusPath = (process.argv[2] ||
  "/home/hugo/Workspace/FrenchTreeBank/originXML/current/corpus-constit/")
  .replace(/\/?$/, "/");
// Documentation of the treebank:
// http://ftb.linguist.univ-paris-diderot.fr/treebank.php?fichier=documentation&langue=en
//
// Part-of-speech tags used in the treebank, one entry per tag.
// The sub-categories of C and N are listed right after their parent tag.
var partOfSpeechTags = [
  "A",      // adjective
  "Adv",    // adverb
  "C",      // conjunction
  "Cc",     //   coordinating conjunction
  "Cs",     //   subordinating conjunction
  "Cl",     // weak clitic pronoun
  "D",      // determiner
  "ET",     // foreign word
  "I",      // interjection
  "N",      // noun
  "Nc",     //   common noun
  "Np",     //   proper noun
  "P",      // preposition
  "PREF",   // prefix
  "PRO",    // strong pronoun
  "V",      // verb
  "PONCT"   // punctuation mark
];
// Phrasal (constituent) tags used in the treebank, one entry per tag.
var phrasalTagSet = [
  "AP",      // adjectival phrase
  "AdP",     // adverbial phrase
  "COORD",   // coordinated phrase
  "NP",      // noun phrase
  "PP",      // prepositional phrase
  "VN",      // verbal nucleus
  "VPinf",   // infinitive clause
  "VPpart",  // nonfinite clause
  "SENT",    // sentence
  "Sint",    // finite clause
  "Srel",    // finite clause
  "Ssub"     // finite clause
];
/**
 * Recursively walks a parsed sentence node (depth first) and appends every
 * word it finds, together with its "cat" attribute, to sentenceObject.
 *
 * The node is read in the shape used throughout this file: attributes under
 * "$", text content under "_", and child elements as arrays keyed by tag
 * name. Only array-valued keys are descended into, so the attribute map,
 * text content and bare string nodes are skipped instead of raising a
 * TypeError.
 *
 * NOTE(review): children are visited in Object.keys order, i.e. grouped per
 * tag name. Whether that reproduces the word order of the XML when different
 * phrase tags alternate should be confirmed against the parser settings.
 *
 * @param {Object} s - Parsed element (a sentence or a phrase below it).
 * @param {Object} sentenceObject - Collector exposing addTaggedWord(word, tag).
 */
function processSentence(s, sentenceObject) {
  // Strings (text-only elements) and null have no child elements to visit
  if (s === null || typeof s !== "object") {
    return;
  }
  Object.keys(s).forEach(tag => {
    const children = s[tag];
    // "$" (attributes) and "_" (text) are not lists of child elements
    if (!Array.isArray(children)) {
      return;
    }
    if (tag === "w") {
      // Add (word, tag) to sentence
      children.forEach(o => {
        // A word element without attributes shows up as a plain string
        const isBareText = typeof o === "string";
        const word = isBareText ? o : o["_"];
        const attributes = isBareText ? undefined : o["$"];
        sentenceObject.addTaggedWord(word, attributes ? attributes["cat"] : undefined);
      });
    } else {
      // Go deeper
      children.forEach(o => processSentence(o, sentenceObject));
    }
  });
}
/**
 * Converts every sentence of a parsed document into a Sentence and prints it
 * as indented JSON on standard output.
 *
 * Sentences are read from object.text.SENT. A missing parse result (null or
 * undefined) and documents without that path are ignored.
 *
 * @param {Object} object - Parse result of one corpus file.
 */
function processObject(object) {
  // Nothing was parsed, so there is nothing to walk
  if (!object) {
    return;
  }
  const text = object["text"];
  const sentences = text ? text["SENT"] : undefined;
  if (!sentences) {
    return;
  }
  sentences.forEach(sentence => {
    // A fresh collector per sentence
    const sentenceObject = new Sentence();
    processSentence(sentence, sentenceObject);
    console.log(JSON.stringify(sentenceObject, null, 2));
  });
}
/**
 * Reads one corpus file as UTF-8, parses it with an xml2js Parser and passes
 * the parse result to processObject.
 *
 * When the parser reports an error, the file is named on standard error and
 * skipped; processObject is not called for it.
 *
 * @param {string} file - Path of the XML file to process.
 */
function processFile(file) {
  const text = fs.readFileSync(file, 'utf8');
  const parser = new xml2js.Parser();
  parser.parseString(text, (err, result) => {
    if (err) {
      // Report which file failed instead of continuing with an undefined result
      console.error(`Skipping ${file}: ${err}`);
      return;
    }
    processObject(result);
  });
}
/**
 * Runs processFile on every regular file located directly inside folder.
 * Sub-directories and other non-file entries are skipped.
 *
 * @param {string} folder - Corpus folder, with or without a trailing path
 *   separator.
 */
function processCorpus(folder) {
  fs.readdirSync(folder).forEach(file => {
    // path.join inserts the separator only when it is missing
    const filePath = path.join(folder, file);
    // Directories cannot be read as files, so only regular files are processed
    if (fs.statSync(filePath).isFile()) {
      processFile(filePath);
    }
  });
}
// Script entry point: process every file in the corpus folder
processCorpus(corpusPath);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment