mayhewsw/allennlp_vocab_test.ipynb

## allennlp_vocab_test.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from allennlp.data.vocabulary import Vocabulary\n",
    "from allennlp.data.dataset_readers import Conll2003DatasetReader\n",
    "from allennlp.data.token_indexers import SingleIdTokenIndexer\n",
    "import random\n",
    "\n",
    "## Make the embeddings first.\n",
    "def getwords(path, w):\n",
    "    with open(path) as f:\n",
    "        for line in f:\n",
    "            sline = line.split()\n",
    "            if len(sline) > 0:\n",
    "                w.add(sline[0])\n",
    "\n",
    "# these embeddings contain *every* word in the three datasets\n",
    "words = set()\n",
    "getwords(\"train.txt\", words)\n",
    "getwords(\"dev.txt\", words)\n",
    "getwords(\"test.txt\", words)\n",
    "\n",
    "dim = 3\n",
    "with open(\"myembs.txt\", \"w\") as out:\n",
    "    out.write(\"{} {}\\n\".format(len(words), dim))\n",
    "    for word in words:\n",
    "        # random 3 dimensional embeddings.\n",
    "        out.write(word + \" \" + \" \".join([str(random.random())]*dim) + \"\\n\")\n",
    "        \n",
    "\n",
    "reader = Conll2003DatasetReader()\n",
    "\n",
    "# gather all data\n",
    "train_dataset = reader.read(\"train.txt\")\n",
    "dev_dataset = reader.read(\"dev.txt\")\n",
    "test_dataset = reader.read(\"test.txt\")\n",
    "\n",
    "# I had thought that the pretrained file would extend the vocabulary, but this is not the case.\n",
    "# this vocab has instances only from train and dev.\n",
    "vocab1 = Vocabulary.from_instances(train_dataset + dev_dataset, pretrained_files={\"tokens\" : \"myembs.txt\"})\n",
    "# this vocab has instances from train, dev, and test.\n",
    "vocab2 = Vocabulary.from_instances(train_dataset + dev_dataset + test_dataset, pretrained_files={\"tokens\" : \"myembs.txt\"})\n",
    "\n",
    "# this has 16 elements\n",
    "print(vocab1)\n",
    "# this has 23 elements\n",
    "print(vocab2)\n",
    "\n",
    "# Build an indexer based on vocab1 and vocab2\n",
    "indexer = SingleIdTokenIndexer()\n",
    "\n",
    "# vocab1 does not contain test data, so these will be all UNK\n",
    "ind1 = indexer.tokens_to_indices(test_dataset[0][\"tokens\"], vocab1, \"tmp\")\n",
    "print([vocab1.get_token_from_index(i) for i in ind1[\"tmp\"]])\n",
    "\n",
    "# vocab2 does contain test data, so these will be all correct.\n",
    "ind2 = indexer.tokens_to_indices(test_dataset[0][\"tokens\"], vocab2, \"tmp\")\n",
    "print([vocab2.get_token_from_index(i) for i in ind2[\"tmp\"]])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

## dev.txt
philly O O I-MNT
is O O O
where O O O
harold O O I-MNT
johnson O O I-MNT
lives O O O
. O O O

same O O O
for O O O
bob O O I-MNT
jones O O I-MNT
. O O O

## requirements.txt
allennlp==0.8.3

## test.txt
jane O O I-MNT
null O O I-MNT
and O O O
nancy O O I-MNT
smith O O I-MNT
are O O O
nearby O O O
. O O O

## train.txt
harold O O I-MNT
johnson O O I-MNT
lives O O O
in O O O
philly O O I-MNT
. O O O

bob O O I-MNT
jones O O I-MNT
does O O O
too O O O
. O O O
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"from allennlp.data.vocabulary import Vocabulary\n",
	"from allennlp.data.dataset_readers import Conll2003DatasetReader\n",
	"from allennlp.data.token_indexers import SingleIdTokenIndexer\n",
	"import random\n",
	"\n",
	"## Make the embeddings first.\n",
	"def getwords(path, w):\n",
	" with open(path) as f:\n",
	" for line in f:\n",
	" sline = line.split()\n",
	" if len(sline) > 0:\n",
	" w.add(sline[0])\n",
	"\n",
	"# these embeddings contain every word in the three datasets\n",
	"words = set()\n",
	"getwords(\"train.txt\", words)\n",
	"getwords(\"dev.txt\", words)\n",
	"getwords(\"test.txt\", words)\n",
	"\n",
	"dim = 3\n",
	"with open(\"myembs.txt\", \"w\") as out:\n",
	" out.write(\"{} {}\\n\".format(len(words), dim))\n",
	" for word in words:\n",
	" # random 3 dimensional embeddings.\n",
	" out.write(word + \" \" + \" \".join([str(random.random())]*dim) + \"\\n\")\n",
	" \n",
	"\n",
	"reader = Conll2003DatasetReader()\n",
	"\n",
	"# gather all data\n",
	"train_dataset = reader.read(\"train.txt\")\n",
	"dev_dataset = reader.read(\"dev.txt\")\n",
	"test_dataset = reader.read(\"test.txt\")\n",
	"\n",
	"# I had thought that the pretrained file would extend the vocabulary, but this is not the case.\n",
	"# this vocab has instances only from train and dev.\n",
	"vocab1 = Vocabulary.from_instances(train_dataset + dev_dataset, pretrained_files={\"tokens\" : \"myembs.txt\"})\n",
	"# this vocab has instances from train, dev, and test.\n",
	"vocab2 = Vocabulary.from_instances(train_dataset + dev_dataset + test_dataset, pretrained_files={\"tokens\" : \"myembs.txt\"})\n",
	"\n",
	"# this has 16 elements\n",
	"print(vocab1)\n",
	"# this has 23 elements\n",
	"print(vocab2)\n",
	"\n",
	"# Build an indexer based on vocab1 and vocab2\n",
	"indexer = SingleIdTokenIndexer()\n",
	"\n",
	"# vocab1 does not contain test data, so these will be all UNK\n",
	"ind1 = indexer.tokens_to_indices(test_dataset[0][\"tokens\"], vocab1, \"tmp\")\n",
	"print([vocab1.get_token_from_index(i) for i in ind1[\"tmp\"]])\n",
	"\n",
	"# vocab2 does contain test data, so these will be all correct.\n",
	"ind2 = indexer.tokens_to_indices(test_dataset[0][\"tokens\"], vocab2, \"tmp\")\n",
	"print([vocab2.get_token_from_index(i) for i in ind2[\"tmp\"]])"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.7"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}
	philly O O I-MNT
	is O O O
	where O O O
	harold O O I-MNT
	johnson O O I-MNT
	lives O O O
	. O O O

	same O O O
	for O O O
	bob O O I-MNT
	jones O O I-MNT
	. O O O
	jane O O I-MNT
	null O O I-MNT
	and O O O
	nancy O O I-MNT
	smith O O I-MNT
	are O O O
	nearby O O O
	. O O O