Stephen Mayhew mayhewsw

## get_size.sh
# Print "<lang> <bytes>" for every language code listed (one per line) in
# langs.txt, using the Content-Length header of that language's Wikipedia
# pages-articles dump for the given dump date.
DUMP_DATE="20201001"

# IFS= and -r keep each line exactly as written (no whitespace trimming, no
# backslash processing); the `|| [ -n "$lng" ]` clause still processes a final
# line that lacks a trailing newline.
while IFS= read -r lng || [ -n "$lng" ]; do
    # A blank line would otherwise produce a meaningless URL.
    [ -z "$lng" ] && continue

    URL="https://dumps.wikimedia.org/${lng}wiki/${DUMP_DATE}/${lng}wiki-${DUMP_DATE}-pages-articles.xml.bz2"

    # -I sends a HEAD request (headers only), -s hides the progress meter.
    # HTTP header lines end in CRLF, so strip the CR before awk prints field 2;
    # otherwise every size is followed by a stray carriage return.
    curl -sI "$URL" | grep -i Content-Length | tr -d '\r' | awk -v l="$lng" '{print l " " $2}'
done < langs.txt

## flair_char_indexer.py
from typing import Dict, List, Tuple, Any

from flair.data import Dictionary
from flair.embeddings import FlairEmbeddings
from overrides import overrides

from allennlp.common.checks import ConfigurationError
from allennlp.common.util import pad_sequence_to_length
from allennlp.data.tokenizers.token import Token
from allennlp.data.token_indexers.token_indexer import TokenIndexer

## combine.py
#!/usr/bin/python
import sys

def combine(f1, f2, outf, limit=-1, dim=64):
    words1 = {}
    words2 = {}

    with open(f1) as f:
        for i,line in enumerate(f):
            if i > limit > -1:

## display_pytorch_matrics.py
import torch
import matplotlib.pyplot as plt
import numpy as np

# I learned this one in allennlp, hence the name.
# Placeholder path to a serialized PyTorch checkpoint; point it at a real file.
p = "path/to/model/best.th"
# map_location="cpu" lets a checkpoint that was saved from a GPU be inspected
# on a machine without CUDA; without it torch.load tries to restore tensors
# onto the device they were saved from.
w = torch.load(p, map_location="cpu")

# List the name of every entry stored in the checkpoint.
for k in w:
    print(k)

## reader.java
import edu.illinois.cs.cogcomp.core.algorithms.LevensteinDistance;
import edu.illinois.cs.cogcomp.core.datastructures.Pair;
import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotationUtilities;
import edu.illinois.cs.cogcomp.core.datastructures.textannotation.View;
import edu.illinois.cs.cogcomp.core.io.LineIO;
import edu.illinois.cs.cogcomp.core.utilities.SerializationHelper;
import org.apache.commons.io.FilenameUtils;

## allennlp_vocab_test.ipynb

      
              5 files
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                mayhewsw
                / allennlp_vocab_test.ipynb
            
            
              Last active
              April 5, 2019 22:40
            
              
                Allennlp Vocabulary Tests.
              
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## vector_select.sh
# Number of vector lines you want
N=50000

# Positional arguments: $1 is the file read below; $2 is presumably the
# destination (it is not used in the visible part of this script).
IN="$1"
OUT="$2"

# Get the dimension from the header.
# The header is the first line of $IN and the dimension is its second
# space-separated field. "$IN" is quoted so that paths containing spaces or
# glob characters are passed to head as a single argument.
DIM=$(head -n 1 "$IN" | cut -d' ' -f2)

# Actually take the top...

## ner_allennlp_test.py
from allennlp.predictors.predictor import Predictor
import time

# URL of a packaged model archive (.tar.gz) hosted on S3.
# NOTE(review): presumably handed to Predictor further down; the rest of the
# script is not visible here, so confirm against the full file.
model = "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.04.26.tar.gz"

# Progress message printed before the (not shown) loading step.
print("Loading model...")

## codenames.py
# coding: utf-8
from gensim.models import KeyedVectors
# Downloaded from fasttext: https://fasttext.cc/docs/en/english-vectors.html
# Converted to word2vec binary format for faster loading (see convert.py)
# NOTE(review): the path starts with "~"; confirm that the loader expands the
# home directory, otherwise wrap it in os.path.expanduser.
vec = KeyedVectors.load_word2vec_format("~/data/wiki-news-300d-1M.vec.bin", binary=True)
from itertools import combinations
from nltk.stem import WordNetLemmatizer,PorterStemmer

# One lemmatizer and one stemmer instance, created once at module load.
wnl = WordNetLemmatizer()
stemmer = PorterStemmer()

## translate.py
#!/usr/bin/python
from googleapiclient.discovery import build
import codecs
import HTMLParser
import shelve

# As of Aug 1 2016
API_KEY = "YOUR_API_KEY_HERE"
	# Print "<lang> <bytes>" for every language code listed (one per line) in
	# langs.txt, using the Content-Length header of that language's Wikipedia
	# pages-articles dump for the given dump date.
	DUMP_DATE="20201001"

	# IFS= and -r keep each line exactly as written; the `|| [ -n "$lng" ]`
	# clause still processes a final line that lacks a trailing newline.
	while IFS= read -r lng || [ -n "$lng" ]; do
	    # A blank line would otherwise produce a meaningless URL.
	    [ -z "$lng" ] && continue

	    URL="https://dumps.wikimedia.org/${lng}wiki/${DUMP_DATE}/${lng}wiki-${DUMP_DATE}-pages-articles.xml.bz2"

	    # The pipes must be real shell pipes: an escaped "\|" is handed to curl
	    # as a literal argument and no pipeline is built.
	    # HTTP header lines end in CRLF, so strip the CR before awk prints field 2.
	    curl -sI "$URL" | grep -i Content-Length | tr -d '\r' | awk -v l="$lng" '{print l " " $2}'
	done < langs.txt
	from typing import Dict, List, Tuple, Any

	from flair.data import Dictionary
	from flair.embeddings import FlairEmbeddings
	from overrides import overrides

	from allennlp.common.checks import ConfigurationError
	from allennlp.common.util import pad_sequence_to_length
	from allennlp.data.tokenizers.token import Token
	from allennlp.data.token_indexers.token_indexer import TokenIndexer
	#!/usr/bin/python
	import sys

	def combine(f1, f2, outf, limit=-1, dim=64):
	words1 = {}
	words2 = {}

	with open(f1) as f:
	for i,line in enumerate(f):
	if i > limit > -1:
	import torch
	import matplotlib.pyplot as plt
	import numpy as np

	# I learned this one in allennlp, hence the name.
	p = "path/to/model/best.th"
	w = torch.load(p)

	for k in w.keys():
	print(k)
	import edu.illinois.cs.cogcomp.core.algorithms.LevensteinDistance;
	import edu.illinois.cs.cogcomp.core.datastructures.Pair;
	import edu.illinois.cs.cogcomp.core.datastructures.ViewNames;
	import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Sentence;
	import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation;
	import edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotationUtilities;
	import edu.illinois.cs.cogcomp.core.datastructures.textannotation.View;
	import edu.illinois.cs.cogcomp.core.io.LineIO;
	import edu.illinois.cs.cogcomp.core.utilities.SerializationHelper;
	import org.apache.commons.io.FilenameUtils;
	# Number of vector lines you want
	N=50000

	# Positional arguments: $1 is the file read below; $2 is presumably the
	# destination (it is not used in the visible part of this script).
	IN="$1"
	OUT="$2"

	# Get the dimension from the header.
	# The pipe must be a real shell pipe: an escaped "\|" is passed to head as
	# a literal argument. "$IN" is quoted so paths containing spaces or glob
	# characters stay a single argument.
	DIM=$(head -n 1 "$IN" | cut -d' ' -f2)

	# Actually take the top...
	from allennlp.predictors.predictor import Predictor
	import time

	model = "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.04.26.tar.gz"

	print("Loading model...")
	# coding: utf-8
	from gensim.models import KeyedVectors
	# Downloaded from fasttext: https://fasttext.cc/docs/en/english-vectors.html
	# Converted to word2vec binary format for faster loading (see convert.py)
	vec = KeyedVectors.load_word2vec_format("~/data/wiki-news-300d-1M.vec.bin", binary=True)
	from itertools import combinations
	from nltk.stem import WordNetLemmatizer,PorterStemmer

	wnl = WordNetLemmatizer()
	stemmer = PorterStemmer()
	#!/usr/bin/python
	from googleapiclient.discovery import build
	import codecs
	import HTMLParser
	import shelve

	# As of Aug 1 2016
	API_KEY = "YOUR_API_KEY_HERE"