Kyle Gorman kylebgorman

## pairs.tsv
フルーツサラダ	fruits salad
クリッパーチップ	clipper chip
ライフサイクル	life cycle
ボイストレーニング	voice training
オップアート	op art
ノーズコーン	nose cone
インカムタックス	income tax
エグゼクティブフロア	executive floor
ウェブフォーム	web form
ハムサンド	ham sand

## rubert-embedding.py
#!/usr/bin/env python

# Documented in: https://metatext.io/models/DeepPavlov-rubert-base-cased

import transformers


# Model identifier passed to both `from_pretrained` calls below, so the model
# and tokenizer are loaded from the same name.
model_name = "DeepPavlov/rubert-base-cased"
# Model object for `model_name`; AutoModel picks the concrete class.
# NOTE(review): presumably fetches/caches weights on first use — confirm
# against the transformers documentation.
model = transformers.AutoModel.from_pretrained(model_name)
# Tokenizer loaded under the same identifier as the model.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

## asciify.pl
#!/usr/bin/perl

use strict;
use warnings;

use Unicode::Normalize;

# Default layer for filehandles opened in this lexical scope: UTF-8.
use open ":encoding(utf8)";
# Standard input is decoded as UTF-8 ...
binmode STDIN, ":encoding(utf8)";
# ... while standard output is restricted to the ASCII encoding.
# NOTE(review): the rest of the script is not visible here; presumably it
# maps text to ASCII before printing — confirm.
binmode STDOUT, ":encoding(ascii)";

## 95to27.pl
#!/usr/bin/perl

use strict;
use warnings;

# Default layer for filehandles opened in this lexical scope: ASCII.
use open ":encoding(ascii)";
# Apply the same ASCII layer explicitly to all three standard handles.
binmode STDIN, ":encoding(ascii)";
binmode STDOUT, ":encoding(ascii)";
binmode STDERR, ":encoding(ascii)";

## sgml2docs.py
#!/usr/bin/env python
"""Extracts documents from the Gigaword SGML."""


import argparse
import logging
import os

import bs4

## LING78100-lecture02.ipynb

      
              1 file
            
          
              0 forks
            
          
              1 comment
            
          
              0 stars
            
          
                kylebgorman
                / LING78100-lecture02.ipynb
            
            
              Created
              September 18, 2019 14:49
            
              
                LING78100 Lecture 2
              
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## lnre.py
#!/usr/bin/env python
"""LNRE calculator.

This script computes a number of statistics characterizing LNRE data:

* N: corpus size
* V: vocabulary size
* V(1): the number of _hapax legomena_ (symbols occurring once)
* V(2): the number of _dis legomena_ (symbols occurring twice)
* V/N: vocabulary growth rate

## byte.sym
<epsilon>	0
<SOH>	1
<STX>	2
<ETX>	3
<EOT>	4
<ENQ>	5
<ACK>	6
<BEL>	7
<BS>	8
<HT>	9

## casefold.py
#!/usr/bin/env python

import fileinput

import nltk


if __name__ == "__main__":
    # Case-fold every input line after dropping its trailing whitespace
    # (including the newline), writing one result per line to stdout.
    folded = (raw.rstrip().casefold() for raw in fileinput.input())
    for text in folded:
        print(text)

## word_tokenize.py
#!/usr/bin/env python

import fileinput

import nltk


if __name__ == "__main__":
    # Tokenize each input line with NLTK and print the tokens on one line,
    # separated by single spaces.
    for raw in fileinput.input():
        tokens = nltk.word_tokenize(raw)
        print(" ".join(tokens))
	フルーツサラダ fruits salad
	クリッパーチップ clipper chip
	ライフサイクル life cycle
	ボイストレーニング voice training
	オップアート op art
	ノーズコーン nose cone
	インカムタックス income tax
	エグゼクティブフロア executive floor
	ウェブフォーム web form
	ハムサンド ham sand
	#!/usr/bin/env python

	# Documented in: https://metatext.io/models/DeepPavlov-rubert-base-cased

	import transformers


	model_name = "DeepPavlov/rubert-base-cased"
	model = transformers.AutoModel.from_pretrained(model_name)
	tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
	#!/usr/bin/perl

	use strict;
	use warnings;

	use Unicode::Normalize;

	use open ":encoding(utf8)";
	binmode STDIN, ":encoding(utf8)";
	binmode STDOUT, ":encoding(ascii)";
	#!/usr/bin/env python
	"""Extracts documents from the Gigaword SGML."""


	import argparse
	import logging
	import os

	import bs4
	#!/usr/bin/env python
	"""LNRE calculator.

	This script computes a number of statistics characterizing LNRE data:

	* N: corpus size
	* V: vocabulary size
	* V(1): the number of _hapax legomena_ (symbols occurring once)
	* V(2): the number of _dis legomena_ (symbols occurring twice)
	* V/N: vocabulary growth rate
	<epsilon> 0
	<SOH> 1
	<STX> 2
	<ETX> 3
	<EOT> 4
	<ENQ> 5
	<ACK> 6
	<BEL> 7
	<BS> 8
	<HT> 9
	#!/usr/bin/env python

	import fileinput

	import nltk


	if __name__ == "__main__":
	    # Case-fold every input line after stripping trailing whitespace.
	    # The loop and its body are nested under the guard; the flattened
	    # indentation in this copy was not valid Python.
	    for line in fileinput.input():
	        print(line.rstrip().casefold())