Emiel van Miltenburg evanmiltenburg

@evanmiltenburg
evanmiltenburg / download_book.py
Last active January 5, 2022 09:35
Download OHLDM (The Open Handbook of Linguistic Data Management)
import requests
import re
import time

# Fetch the book's landing page, presenting a browser-like user agent.
r = requests.get('https://direct.mit.edu/books/book/5244/The-Open-Handbook-of-Linguistic-Data-Management',
                 stream=True, headers={'User-agent': 'Mozilla/5.0'})

# Collect the chapter PDF links and turn them into absolute URLs.
urls = re.findall(r'href="(.*?\.pdf)"', r.text)
base = 'https://direct.mit.edu'
urls = [base + path for path in urls if '/book/' in path]
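The preview stops before any files are actually fetched; below is a minimal sketch of how the download loop might continue (the chapter_NN.pdf naming scheme and the one-second pause are assumptions, not part of the gist).

# Hypothetical continuation: fetch each chapter PDF and save it, pausing between requests.
for i, url in enumerate(urls):
    response = requests.get(url, headers={'User-agent': 'Mozilla/5.0'})
    with open('chapter_{:02d}.pdf'.format(i), 'wb') as f:  # assumed naming scheme
        f.write(response.content)
    time.sleep(1)  # be polite to the server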
@evanmiltenburg
evanmiltenburg / find_people.py
Created April 5, 2020 08:28
Script to find people in Dutch text
import spacy

# Load the small Dutch pipeline (install with: python -m spacy download nl_core_news_sm).
nlp = spacy.load('nl_core_news_sm')

with open('bordewijk.txt') as f:
    doc = nlp(f.read())

# Keep the entities tagged as persons (in some model versions the label is 'PER').
people = [ent.orth_ for ent in doc.ents if ent.label_ == 'PERSON']
print(people)
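As a small follow-up, one might want the most frequently mentioned names rather than the raw list; the Counter-based ranking below is a sketch of mine, not part of the gist.

from collections import Counter

# Hypothetical follow-up: rank the detected person names by frequency.
for name, count in Counter(people).most_common(10):
    print(name, count)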
@evanmiltenburg
evanmiltenburg / excel_dropdown_test.py
Created June 17, 2019 10:14
Generate an Excel worksheet to provide word-level annotations
import xlsxwriter
# Create workbook with a new worksheet.
workbook = xlsxwriter.Workbook('hello.xlsx')
worksheet = workbook.add_worksheet()
# Write the tokens.
worksheet.write('A1', 'Hello')
worksheet.write('B1', 'world')
worksheet.write('C1', '!')
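The filename suggests that each token also gets an annotation dropdown; here is a minimal sketch of how that might look with xlsxwriter's data_validation (the label set and the cell positions are assumptions).

# Hypothetical continuation: put an annotation dropdown underneath each token.
labels = ['correct', 'incorrect', 'unsure']  # assumed annotation labels
for cell in ('A2', 'B2', 'C2'):
    worksheet.data_validation(cell, {'validate': 'list', 'source': labels})

workbook.close()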
import numpy as np

def get_lengths(num_lines, line_length):
    "Get n lines, totaling a particular length."
    lengths = np.random.random(num_lines)
    lengths *= line_length / np.sum(lengths)
    return lengths

def lines(line_length, page_width):
    "Get a random number of lines, with n-1 gaps of varying length in between."
    num_lines = np.random.randint(1, 10)
    lengths = get_lengths(num_lines, line_length)
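A quick usage example of get_lengths; the seed and the numbers are mine, added only to show the invariant that the lengths sum to the requested total.

# Example: five random segment lengths that together add up to 100 units.
np.random.seed(42)
segment_lengths = get_lengths(5, 100)
print(segment_lengths, segment_lengths.sum())  # the sum is 100.0 (up to rounding)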
@evanmiltenburg
evanmiltenburg / levelt.tex
Created April 26, 2018 12:08
Levelt's model of speech production
\documentclass[12pt]{standalone}
\usepackage{tgtermes}
\usepackage{tgheros}
\usepackage[T1]{fontenc}
\usepackage{tikz}
\usetikzlibrary{arrows}
\usetikzlibrary{arrows.meta}
\usetikzlibrary{calc}
@evanmiltenburg
evanmiltenburg / legend_circles.py
Created February 13, 2018 15:02
Circles in legend
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.lines import Line2D
my_palette = sns.color_palette("cubehelix", 3)
sns.set_palette(my_palette)
def legend_circles(labels, palette, loc=1, markersize=10, marker='o', padding=0):
    "Make a legend where the color is indicated by a circle."

Papers at INLG

Here's a list of all the papers presented at INLG 2017, sourced from here. I made this list because it's easier to read and print.

Please refer to the INLG website for the official schedule, which may still change and which also lists other events, such as the invited talks and the hackathon.

Tuesday

import csv
import numpy as np
from gensim.models import Word2Vec
np.random.seed(1234)
from keras.models import Sequential
from keras.layers.core import Activation, Dense
from keras.callbacks import EarlyStopping
from collections import Counter
import re
import glob
class ConllEntry:
    def __init__(self, id, form, pos, cpos, parent_id=None, relation=None):
        self.id = id
        self.form = form
        self.norm = normalize(form)
        self.cpos = cpos.upper()
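The class calls a normalize helper that lies outside the preview; below is a minimal sketch of the usual behaviour in this kind of parser code (lowercase everything and collapse number-like tokens), offered as an assumption rather than the original definition.

# Hypothetical helper: map number-like tokens to 'NUM' and lowercase the rest.
number_regex = re.compile(r'^[0-9][0-9.,]*$')

def normalize(word):
    return 'NUM' if number_regex.match(word) else word.lower()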

Training a Dutch parser

Steps

  1. Get the text data: wget http://kyoto.let.vu.nl/~miltenburg/public_data/wikicorpus/corpus/wikicorpus.txt.gz
  2. Get the code for the structured n-grams: wget https://github.com/wlin12/wang2vec/archive/master.zip
  3. Run unzip master.zip ; rm master.zip
  4. Build the word vector code: Run cd wang2vec-master/ ; make ; cd ..
  5. Train CBOW vectors: Run ./wang2vec-master/word2vec -train wikicorpus.txt -output cbow.vectors -type 0 -size 50 -window 5 -negative 10 -nce 0 -hs 0 -sample 1e-4 -threads 1 -iter 5 -cap 0 >> training.log 2>&1 &
  6. Train Structured skipngram vectors: Run ./wang2vec-master/word2vec -train wikicorpus.txt -output structured_ngram.vectors -type 3 -size 50 -window 5 -negative 10 -nce 0 -hs 0 -sample 1e-4 -threads 1 -iter 5 -cap 0 >> training_ssg.log 2>&1 & (a sketch for loading the resulting vectors in Python follows below)
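Once training finishes, the vectors should be in the plain-text word2vec format, so a quick way to sanity-check them is to load them with gensim. A minimal sketch, assuming the default text output and the file name from step 5; the probe word 'taal' (Dutch for 'language') is just an example.

from gensim.models import KeyedVectors

# Hypothetical check: load the trained CBOW vectors and inspect a few nearest neighbours.
vectors = KeyedVectors.load_word2vec_format('cbow.vectors', binary=False)
print(vectors.most_similar('taal', topn=5))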