Skip to content

Instantly share code, notes, and snippets.

@mayhewsw
Created June 27, 2019 14:23
Show Gist options
  • Save mayhewsw/3f564aa268c62189c2f1a37c0654ad90 to your computer and use it in GitHub Desktop.
Save mayhewsw/3f564aa268c62189c2f1a37c0654ad90 to your computer and use it in GitHub Desktop.
Combine two word-vector text files (especially ones from different languages)
#!/usr/bin/python
import sys
def _read_vectors(path, limit=-1):
    """Read a word-vector text file into a ``{word: vector_text}`` dict.

    Each vector line is ``word v1 v2 ... vn``; the vector part is kept as the
    raw text after the first space so values are written back unchanged.

    Args:
        path: Path of the text file to read (decoded as UTF-8).
        limit: If greater than -1, stop once the zero-based line index
            exceeds this value. A header line, if present, counts as a line.

    Returns:
        Dict mapping each word to its vector text. If a word occurs more than
        once, the last occurrence wins.
    """
    vectors = {}
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i > limit > -1:
                break
            fields = line.strip().split(" ", 1)
            if len(fields) < 2:
                # Blank line or a word with no vector: nothing to store.
                continue
            word, emb = fields
            # An optional "<count> <dim>" header may appear on the first line.
            # It is recognised by its shape (two integers) rather than by line
            # length, so short vectors are not mistaken for a header.
            if i == 0 and word.isdigit() and emb.strip().isdigit():
                continue
            vectors[word] = emb
    return vectors


def combine(f1, f2, outf, limit=-1, dim=64):
    """Merge two word-vector text files into one file with a fresh header.

    The vocabulary sizes read from ``f1`` and ``f2`` are printed to stdout.
    When a word appears in both files, the vector from ``f1`` is kept.

    Args:
        f1: Path of the preferred word-vector file.
        f2: Path of the second word-vector file.
        outf: Path of the output file (written as UTF-8).
        limit: If greater than -1, read only lines whose zero-based index is
            at most ``limit`` from each input file.
        dim: Fallback dimensionality for the output header, used only when no
            vectors were read and the dimensionality cannot be inferred.
    """
    words1 = _read_vectors(f1, limit)
    words2 = _read_vectors(f2, limit)

    print(len(words1))
    print(len(words2))

    # combine these two, prefer vocab in words1
    comb = {**words2, **words1}

    # Take the header dimensionality from the data itself, so the header is
    # right for embeddings of any size, not only the default.
    out_dim = len(next(iter(comb.values())).split()) if comb else dim

    with open(outf, "w", encoding="utf-8") as out:
        # write new header first
        out.write("{} {}\n".format(len(comb), out_dim))
        for w, emb in comb.items():
            out.write("{} {}\n".format(w, emb))
if __name__ == "__main__":
    # Command line: <script> F1 F2 OUTF [LIMIT]
    # Print a usage message when a required path is missing, instead of
    # failing with an IndexError traceback.
    if len(sys.argv) < 4:
        sys.exit("usage: {} F1 F2 OUTF [LIMIT]".format(sys.argv[0]))

    f1, f2, outf = sys.argv[1:4]

    # The optional fourth argument caps how many lines are read per file;
    # -1 means no cap.
    limit = int(sys.argv[4]) if len(sys.argv) == 5 else -1

    combine(f1, f2, outf, limit)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment