Last active June 21, 2017 20:30
var conllu = require('conllu-stream');
var fs = require('fs');
// NOTE(review): the expression this `.on('data', ...)` chains onto is missing
// from this excerpt — presumably a file read stream piped through `conllu()`;
// confirm against the original script.
// For every parsed sentence, log its `sent_id` feature followed by the
// sentence's string form.
.on('data', sentence => {
console.log(sentence.features.sent_id, sentence.toString());
// NOTE(review): the closing `});` of this handler is not present in this excerpt.
var _ = require('lodash');
var conllu = require('conllu-stream');
var fs = require('fs');
/**
 * Format a ratio as a compact percentage string.
 *
 * The number of significant digits shrinks with the magnitude so that columns
 * of percentages stay short: 3 digits above 10%, 2 digits above 1%, and a
 * single digit for everything else.
 *
 * @param {number} value - Ratio to format (e.g. 0.25 for 25%).
 * @returns {string} The percentage with a trailing '%', or '' when the input
 *   is not a number (e.g. `undefined / n` for a separator row).
 */
function percent(value) {
  // Work on a local copy instead of reassigning the parameter.
  const pct = value * 100;
  if (Number.isNaN(pct)) {
    return '';
  }
  if (pct > 10) {
    return pct.toPrecision(3) + '%';
  }
  if (pct > 1) {
    return pct.toPrecision(2) + '%';
  }
  return pct.toPrecision(1) + '%';
}
/**
 * Calculate and display a two-level histogram of `words`.
 *
 * First the histogram of the primary `key` is calculated. Then, for each
 * displayed key, a secondary histogram of `linkKey` is calculated over the
 * words that share that key, and its 6 most frequent values are listed.
 *
 * Only the 10 most frequent and 10 least frequent primary values are shown,
 * separated by a `--` row. When the histogram has 20 rows or fewer, every row
 * is shown once (the two slices would otherwise overlap and repeat rows).
 *
 * @param {Object[]} words - Word objects to analyse.
 * @param {string} key - Property used for the primary histogram.
 * @param {string} linkKey - Property used for the secondary histogram.
 * @returns {void} Results are written to the console.
 */
function displayHistogram(words, key, linkKey) {
  const SLICE_SIZE = 10;
  const LINK_LIMIT = 6;

  // Calculate `key` histogram of words as [value, count] pairs,
  // sorted by descending frequency (ties broken by the value itself).
  const grouped = _.groupBy(words, key);
  const histogram = _(grouped)
    .map((group, value) => [value, group.length])
    .sortBy([1, 0])
    .reverse()
    .value();

  // Combine top and bottom parts to display; skip the split when the two
  // parts would overlap.
  const entries = histogram.length <= 2 * SLICE_SIZE
    ? histogram
    : histogram
      .slice(0, SLICE_SIZE)
      .concat([['--']])
      .concat(histogram.slice(-SLICE_SIZE));

  const rows = entries.map(([value, count]) => {
    // The separator row has no count and is printed as-is.
    if (count === undefined) {
      return [value];
    }

    // Secondary histogram: `linkKey` values of all words sharing this key.
    const linked = _.map(grouped[value], linkKey);
    const links = _(linked)
      .countBy()
      .toPairs()
      .sortBy([1, 0])
      .reverse()
      .slice(0, LINK_LIMIT)
      // Show percent values for each item.
      .map(item => `${item[0]} (${percent(item[1] / linked.length)})`)
      .join(', ');

    return [value, count, percent(count / words.length), links];
  });

  // Display table of results.
  console.log('-- %s --', key);
  console.log(rows.map(row => row.join('\t')).join('\n'));
  console.log('#words :', words.length);
  console.log('#histogram :', histogram.length);
}
// Array to store all word objects we encounter.
var words = [];
// Parse CoNLL-U file.
// NOTE(review): the stream expression that `.on('data', ...)` chains onto is
// missing from this excerpt — confirm against the original script.
.on('data', sentence => {
// Collect all words and lemmas in lowercase (except punctuation/numbers).
// NOTE(review): the expression yielding the sentence's words (the receiver of
// `.filter`) is missing from this excerpt.
// Words whose `upostag` is PUNCT or NUM are skipped.
.filter(word => [ 'PUNCT', 'NUM' ].indexOf(word.upostag) === -1)
.forEach(word => {
// Using lowercase so "Haus" and "haus" are counted together.
// The word object is modified in place.
word.form = word.form.toLowerCase();
word.lemma = word.lemma.toLowerCase();
// NOTE(review): nothing visible here appends `word` to `words`, and the
// callbacks above are never closed — presumably lost in extraction; confirm.
.on('end', () => {
// Calculate and display histograms of words vs. lemmas and vice versa.
displayHistogram(words, 'form', 'lemma');
displayHistogram(words, 'lemma', 'form');
console.log('-- done --');
var _ = require('lodash');
var conllu = require('conllu-stream');
var fs = require('fs');
// Collected "multiword --> expansion" strings, one per multiword token seen.
var multiwords = [];
// NOTE(review): the stream expression that `.on('data', ...)` chains onto is
// missing from this excerpt — confirm against the original script.
.on('data', sentence => {
// Collect all multiword tokens and their expansions in lowercase.
// NOTE(review): the expression producing the token ids (the receiver of
// `.map`) is missing from this excerpt.
.map(id => sentence.tokens[id])
.forEach(multiword => {
// Get expanded form of the multiword: the forms of the tokens from
// `position` through `endPosition` (inclusive, hence the +1), joined by spaces.
// Token ids are converted to strings before the `sentence.tokens` lookup.
var expansion =
_.range(multiword.position, multiword.endPosition+1)
.map(id => sentence.tokens[''+id].form)
.join(' ');
// Store multiword and its expansion.
multiwords.push(multiword.form.toLowerCase() +
'\t-->\t' + expansion.toLowerCase());
.on('end', () => {
// Calculate and show histogram sorted by frequency.
// NOTE(review): the start of this chain (presumably counting `multiwords`
// into [text, count] rows) and the final output call are missing from this
// excerpt — confirm against the original script.
.sortBy([ 1, 0 ])
.map(row => row.join('\t\t'))
"dependencies": {
"conllu-stream": "0.0.1",
"lodash": "^4.17.4"
# sent_id = train-s2
# text = Die Kosten sind definitiv auch im Rahmen.
#id form lemma upostag xpostag feats head deprel deps misc
1 Die der DET ART Case=Nom|Definite=Def|Gender=Fem|Number=Sing|PronType=Art 2 det _ _
2 Kosten Kosten NOUN NN Case=Nom|Gender=Fem|Number=Sing 3 nsubj:pass _ _
3 sind sein VERB VAFIN Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ _
4 definitiv definitiv ADV ADJD _ 3 advmod _ _
5 auch auch ADV ADV _ 3 advmod _ _
6-7 im _ _ _ _ _ _ _ _
6 in in ADP APPR _ 8 case _ _
7 dem der DET ART Case=Dat|Definite=Def|Gender=Masc,Neut|Number=Sing|PronType=Art 8 det _ _
8 Rahmen Rahmen NOUN NN Case=Dat|Gender=Masc,Neut|Number=Sing 3 obl _ SpaceAfter=No
9 . . PUNCT $. _ 3 punct _ _
