Skip to content

Instantly share code, notes, and snippets.

@walling
Last active June 21, 2017 20:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save walling/e90c3e1042e1760d426a7008195894a8 to your computer and use it in GitHub Desktop.
Save walling/e90c3e1042e1760d426a7008195894a8 to your computer and use it in GitHub Desktop.
blog-post-2017-06-21-examples
var conllu = require('conllu-stream');
var fs = require('fs');
// Logs the `sent_id` feature and the string form of one parsed sentence.
const printSentence = sentence => {
  const sentenceId = sentence.features.sent_id;
  console.log(sentenceId, sentence.toString());
};

// Stream the German training treebank through the CoNLL-U parser and print
// every sentence as it is emitted.
const sentenceStream = fs
  .createReadStream('ud-treebanks-v2.0/UD_German/de-ud-train.conllu')
  .pipe(conllu());
sentenceStream.on('data', printSentence);
var _ = require('lodash');
var conllu = require('conllu-stream');
var fs = require('fs');
// Formats a fraction (e.g. 0.5) as a percent string (e.g. "50.0%").
// The number of significant digits shrinks with the magnitude:
// three above 10%, two above 1%, otherwise one.
// Returns an empty string when the scaled value is NaN.
function percent(value) {
  const scaled = value * 100;
  if (Number.isNaN(scaled)) {
    return '';
  }

  let digits = 1;
  if (scaled > 10) {
    digits = 3;
  } else if (scaled > 1) {
    digits = 2;
  }
  return `${scaled.toPrecision(digits)}%`;
}
// Calculates and prints a histogram of `words`.
// It first calculates the histogram of the primary `key`.
// For each displayed key it then calculates a secondary histogram of the
// secondary `linkKey` among the words in that group.
//
//   words   - array of word objects.
//   key     - lodash iteratee used for grouping (callers pass a property name).
//   linkKey - lodash iteratee for the secondary histogram (callers pass a
//             property name).
//
// Output goes to the console; nothing is returned and `words` is not modified.
function displayHistogram(words, key, linkKey) {
  // Number of rows shown from each end of the histogram.
  const shownPerEnd = 10;
  // Number of secondary values listed per row.
  const shownLinked = 6;

  // Calculate `key` histogram of words, sorted by frequency (descending),
  // ties broken by key (descending).
  const grouped = _.groupBy(words, key);
  const histogram = _(grouped)
    .mapValues('length')
    .toPairs()
    .sortBy([ 1, 0 ])
    .reverse()
    .value();

  // Builds one display row: key, count, share of all words, and the most
  // frequent `linkKey` values within the group. A new array is returned so
  // the histogram pairs themselves are never mutated.
  const toRow = pair => {
    const name = pair[0];
    const count = pair[1];
    const linked = _.map(grouped[name], linkKey);
    const linkedSummary = _(linked)
      .groupBy()
      .mapValues('length')
      .toPairs()
      .sortBy([ 1, 0 ])
      .reverse()
      // Show percent values for each item.
      .map(item => `${item[0]} (${percent(item[1] / linked.length)})`)
      .slice(0, shownLinked)
      .join(', ');
    return [ name, count, percent(count / words.length), linkedSummary ];
  };

  // Show the top and bottom parts separated by a "--" row, but only when rows
  // are actually omitted in between. For short histograms the two slices
  // would overlap and print the same rows twice, so show everything instead.
  // The separator is a fixed row; it is never looked up in `grouped`, so a
  // real key named "--" cannot leak into it.
  let rows;
  if (histogram.length > 2 * shownPerEnd) {
    const separator = [ '--', '', '', '' ];
    rows = histogram.slice(0, shownPerEnd).map(toRow)
      .concat([ separator ])
      .concat(histogram.slice(-shownPerEnd).map(toRow));
  } else {
    rows = histogram.map(toRow);
  }

  // Display table of results.
  console.log('-- %s --', key);
  console.log();
  console.log(rows.map(row => row.join('\t')).join('\n'));
  console.log();
  console.log('#words :', words.length);
  console.log('#histogram :', histogram.length);
  console.log();
}
// All word objects encountered so far (punctuation and numbers excluded),
// with `form` and `lemma` already lowercased.
const words = [];

// True for words that take part in the statistics, i.e. everything except
// tokens tagged PUNCT or NUM.
function isCountedWord(word) {
  const tag = word.upostag;
  return tag !== 'PUNCT' && tag !== 'NUM';
}

// Lowercases a word's form and lemma in place and stores it.
// Lowercasing makes "Haus" and "haus" count as the same entry.
function storeWord(word) {
  word.form = word.form.toLowerCase();
  word.lemma = word.lemma.toLowerCase();
  words.push(word);
}

// Once the whole file is parsed, print the form -> lemma histogram and the
// lemma -> form histogram.
function reportWordHistograms() {
  displayHistogram(words, 'form', 'lemma');
  displayHistogram(words, 'lemma', 'form');
  console.log('-- done --');
}

// Parse the CoNLL-U file, collecting the counted words of every sentence.
const wordStream = fs
  .createReadStream('ud-treebanks-v2.0/UD_German/de-ud-train.conllu')
  .pipe(conllu());
wordStream
  .on('data', sentence => {
    sentence.getSequence().filter(isCountedWord).forEach(storeWord);
  })
  .on('end', reportWordHistograms);
var _ = require('lodash');
var conllu = require('conllu-stream');
var fs = require('fs');
// One entry per multiword token occurrence, formatted in lowercase as
// "<multiword form>\t-->\t<forms of the tokens it spans>".
const multiwords = [];

// Returns the forms of the tokens from `multiword.position` through
// `multiword.endPosition` (inclusive), joined by single spaces.
function expandMultiword(sentence, multiword) {
  const ids = _.range(multiword.position, multiword.endPosition + 1);
  const forms = ids.map(id => sentence.tokens[String(id)].form);
  return forms.join(' ');
}

// Looks up every multiword token of the sentence and records it together
// with its expansion.
function collectMultiwords(sentence) {
  const tokens = sentence.structure.multiwords.map(id => sentence.tokens[id]);
  tokens.forEach(multiword => {
    const contracted = multiword.form.toLowerCase();
    const expanded = expandMultiword(sentence, multiword).toLowerCase();
    multiwords.push(`${contracted}\t-->\t${expanded}`);
  });
}

// Prints each distinct multiword line with its number of occurrences,
// most frequent first.
function printMultiwordHistogram() {
  const rows = _(multiwords)
    .groupBy()
    .mapValues('length')
    .toPairs()
    .sortBy([ 1, 0 ])
    .reverse()
    .map(row => row.join('\t\t'));
  console.log(rows.join('\n'));
}

const multiwordStream = fs
  .createReadStream('ud-treebanks-v2.0/UD_German/de-ud-train.conllu')
  .pipe(conllu());
multiwordStream
  .on('data', collectMultiwords)
  .on('end', printMultiwordHistogram);
{
"dependencies": {
"conllu-stream": "0.0.1",
"lodash": "^4.17.4"
}
}
# sent_id = train-s2
# text = Die Kosten sind definitiv auch im Rahmen.
#id form lemma upostag xpostag feats head deprel deps misc
1 Die der DET ART Case=Nom|Definite=Def|Gender=Fem|Number=Sing|PronType=Art 2 det _ _
2 Kosten Kosten NOUN NN Case=Nom|Gender=Fem|Number=Sing 3 nsubj:pass _ _
3 sind sein VERB VAFIN Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _
4 definitiv definitiv ADV ADJD _ 3 advmod _ _
5 auch auch ADV ADV _ 3 advmod _ _
6-7 im _ _ _ _ _ _ _ _
6 in in ADP APPR _ 8 case _ _
7 dem der DET ART Case=Dat|Definite=Def|Gender=Masc,Neut|Number=Sing|PronType=Art 8 det _ _
8 Rahmen Rahmen NOUN NN Case=Dat|Gender=Masc,Neut|Number=Sing 3 obl _ SpaceAfter=No
9 . . PUNCT $. _ 3 punct _ _
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment