aanastasiou/README.md

## README.md

      
    Raw
  

              README.md
            
          
Use GloVe to produce vector data

This will produce the `vectors.txt` and `vectors.bin` files.


Use `main.py` to produce a new "vectors only" spaCy model, using `en_core_web_md` as the base. The script reads `./vectors.txt` and writes the model to `../ModelInputDir`.


Once this finishes, run `spacy package inDir outDir`, with `inDir` and `outDir` set to `ModelInputDir` and `WhateverOutputDir` respectively.

This step will create a package that can be readily used by spacy.


From within `WhateverOutputDir`, execute `pip install -e ./` to install the model as a module.


Use that as a model reference in python code or when using Prodigy.


## main.py
'''Athanasios Anastasiou Feb 2018

A brief script to compile vectors to Spacy models
'''

import sys
import spacy
import numpy

# The script loads en_core_web_md as its base model, so there is nothing useful
# to do without it: print where to get it and exit with a failure status.
# Only ImportError is handled; any other failure raised while importing the
# package (or a Ctrl-C) propagates instead of being misreported as
# "not installed".
try:
    import en_core_web_md
except ImportError:
    sys.stdout.write("en_core_web_md not installed, it can be obtained from:https://spacy.io/usage/models\n")
    sys.exit(-1)


def parse_vector_line(line, width):
    '''Split one line of the GloVe text output into a (key, values) pair.

    The original author observed that GloVe can emit keys containing
    whitespace, so the key cannot simply be taken as the first token.
    Instead, the last ``width`` whitespace-separated tokens are treated as
    the vector components and everything before them is re-joined with
    single spaces to form the key.

    Args:
        line: One raw line read from the vectors text file.
        width: Number of vector components expected at the end of the line.

    Returns:
        A tuple ``(key, values)`` where ``key`` is a str and ``values`` is a
        list of floats.

    Raises:
        ValueError: If one of the trailing ``width`` tokens is not a number.
    '''
    tokens = line.split()  # split once and reuse for both key and values
    key = " ".join(tokens[:-width])
    values = [float(token) for token in tokens[-width:]]
    return key, values


if __name__ == "__main__":
    nlp = en_core_web_md.load()  # This will take some time

    # This is a parameter of the whole process: the length of the vector for
    # each word. The original author notes that it seems to be NVectors+1,
    # where NVectors is the number-of-vectors parameter passed to GloVe.
    width = 300

    # NOTE(review): reset_vectors() is called without arguments, exactly as
    # before; confirm the resulting table width matches `width`.
    nlp.vocab.reset_vectors()

    # Read explicitly as UTF-8 rather than the platform default encoding, and
    # stream the file line by line instead of holding it in memory twice.
    with open("./vectors.txt", encoding="utf-8") as fd:
        for line in fd:
            # A blank line has neither a key nor a vector; skip it instead of
            # registering an empty key with a zero-length array.
            if not line.strip():
                continue
            key, values = parse_vector_line(line, width)
            nlp.vocab.set_vector(key, numpy.array(values))

    # Replace the metadata inherited from the base model with this model's own
    metaData = {'author': 'Athanasios Anastasiou',
                'description': 'Tech vocab',
                'email': 'a.anastasiou@swansea.ac.uk',
                'lang': 'en',
                'license': 'CC BY-SA 3.0',
                'name': 'techVocab_test',
                'parent_package': 'spacy',
                'pipeline': ['tagger', 'parser', 'ner'],
                'spacy_version': '>=2.0.0a18',
                'version': '1.0.0'}
    nlp.meta = metaData
    # Now that the model is ready it needs to be saved
    nlp.to_disk('../ModelInputDir')
	'''Athanasios Anastasiou Feb 2018

	A brief script to compile vectors to Spacy models
	'''

	import sys
	import spacy
	import numpy

	# The script loads en_core_web_md as its base model, so there is nothing
	# useful to do without it: print where to get it and exit with a failure
	# status. Only ImportError is handled; any other failure propagates instead
	# of being misreported as "not installed".
	try:
		import en_core_web_md
	except ImportError:
		sys.stdout.write("en_core_web_md not installed, it can be obtained from:https://spacy.io/usage/models\n")
		sys.exit(-1)


	def parse_vector_line(line, width):
		'''Split one line of the GloVe text output into a (key, values) pair.

		The original author observed that GloVe can emit keys containing
		whitespace, so the key cannot simply be taken as the first token.
		Instead, the last ``width`` whitespace-separated tokens are treated as
		the vector components and everything before them is re-joined with
		single spaces to form the key.

		Args:
			line: One raw line read from the vectors text file.
			width: Number of vector components expected at the end of the line.

		Returns:
			A tuple ``(key, values)`` where ``key`` is a str and ``values`` is a
			list of floats.

		Raises:
			ValueError: If one of the trailing ``width`` tokens is not a number.
		'''
		tokens = line.split()  # split once and reuse for both key and values
		key = " ".join(tokens[:-width])
		values = [float(token) for token in tokens[-width:]]
		return key, values


	if __name__ == "__main__":
		nlp = en_core_web_md.load()  # This will take some time

		# This is a parameter of the whole process: the length of the vector for
		# each word. The original author notes that it seems to be NVectors+1,
		# where NVectors is the number-of-vectors parameter passed to GloVe.
		width = 300

		# NOTE(review): reset_vectors() is called without arguments, exactly as
		# before; confirm the resulting table width matches `width`.
		nlp.vocab.reset_vectors()

		# Read explicitly as UTF-8 rather than the platform default encoding, and
		# stream the file line by line instead of holding it in memory twice.
		with open("./vectors.txt", encoding="utf-8") as fd:
			for line in fd:
				# A blank line has neither a key nor a vector; skip it instead
				# of registering an empty key with a zero-length array.
				if not line.strip():
					continue
				key, values = parse_vector_line(line, width)
				nlp.vocab.set_vector(key, numpy.array(values))

		# Replace the metadata inherited from the base model with this model's own
		metaData = {'author': 'Athanasios Anastasiou',
			'description': 'Tech vocab',
			'email': 'a.anastasiou@swansea.ac.uk',
			'lang': 'en',
			'license': 'CC BY-SA 3.0',
			'name': 'techVocab_test',
			'parent_package': 'spacy',
			'pipeline': ['tagger', 'parser', 'ner'],
			'spacy_version': '>=2.0.0a18',
			'version': '1.0.0'}
		nlp.meta = metaData
		# Now that the model is ready it needs to be saved
		nlp.to_disk('../ModelInputDir')