# mayhewsw/combine.py

## combine.py
#!/usr/bin/python
import sys

def _parse_header(text):
    """Return ``(count, dim)`` if *text* looks like a header line, else ``None``.

    A header is a line made of exactly two integer fields, e.g. ``"1000 64"``.
    """
    fields = text.split()
    if len(fields) != 2:
        return None
    try:
        return int(fields[0]), int(fields[1])
    except ValueError:
        return None


def _read_embeddings(path, limit=-1):
    """Read a text embedding file into a ``{word: vector_text}`` dict.

    Each embedding line is ``word`` followed by a space and the vector; the
    vector is kept as the raw text after the first space.  Header lines are
    recognised by their shape (two integers) rather than by their length, so
    short vectors are no longer mistaken for headers.

    Returns:
        A ``(words, header_dim)`` tuple.  ``header_dim`` is the dimension
        declared by the last header line seen, or ``None`` if there was none.
    """
    words = {}
    header_dim = None
    with open(path) as f:
        for i, line in enumerate(f):
            # With a non-negative limit, lines 0..limit (inclusive) are read.
            if i > limit > -1:
                break

            stripped = line.strip()
            header = _parse_header(stripped)
            if header is not None:
                header_dim = header[1]
                continue

            word, sep, emb = stripped.partition(" ")
            if not sep:
                # Blank line, or a word with no vector: nothing to store.
                continue
            words[word] = emb
    return words, header_dim


def combine(f1, f2, outf, limit=-1, dim=64):
    """Merge two text embedding files into one, preferring entries from *f1*.

    Args:
        f1: Path of the first embedding file; its vectors win on duplicates.
        f2: Path of the second embedding file.
        outf: Path of the file to write.
        limit: If non-negative, only lines 0..limit of each input are read
            (the header line, when present, counts as a line).  ``-1`` reads
            everything.
        dim: Dimension written into the output header.  It is NOT derived
            from the inputs; a warning is printed to stderr when an input
            header declares a different dimension.

    The number of words read from each input is printed to stdout.  The
    output starts with a ``"<count> <dim>"`` header followed by one
    ``"<word> <vector>"`` line per word.
    """
    words1, dim1 = _read_embeddings(f1, limit)
    words2, dim2 = _read_embeddings(f2, limit)

    print(len(words1))
    print(len(words2))

    # The header keeps using `dim` so existing callers get the same output,
    # but a disagreement with the input headers is worth flagging.
    for path, found in ((f1, dim1), (f2, dim2)):
        if found is not None and found != dim:
            print(
                "warning: {} declares dimension {}, but the output header "
                "will say {}".format(path, found, dim),
                file=sys.stderr,
            )

    # combine these two, prefer vocab in words1
    comb = {**words2, **words1}

    with open(outf, "w") as out:
        # write new header first
        out.write("{} {}\n".format(len(comb), dim))
        for w, emb in comb.items():
            out.write("{} {}\n".format(w, emb))

if __name__ == "__main__":
    # Positional arguments: first embedding file, second embedding file,
    # output path.
    f1, f2, outf = sys.argv[1], sys.argv[2], sys.argv[3]
    # The line limit is taken from the fourth argument only when exactly
    # four arguments were supplied; -1 makes combine() read every line.
    limit = int(sys.argv[4]) if len(sys.argv) == 5 else -1
    combine(f1, f2, outf, limit)
	#!/usr/bin/python
	import sys

	def _parse_header(text):
		"""Return ``(count, dim)`` if *text* looks like a header line, else ``None``.

		A header is a line made of exactly two integer fields, e.g. ``"1000 64"``.
		"""
		fields = text.split()
		if len(fields) != 2:
			return None
		try:
			return int(fields[0]), int(fields[1])
		except ValueError:
			return None


	def _read_embeddings(path, limit=-1):
		"""Read a text embedding file into a ``{word: vector_text}`` dict.

		Each embedding line is ``word`` followed by a space and the vector; the
		vector is kept as the raw text after the first space.  Header lines are
		recognised by their shape (two integers) rather than by their length.

		Returns:
			A ``(words, header_dim)`` tuple.  ``header_dim`` is the dimension
			declared by the last header line seen, or ``None`` if there was none.
		"""
		words = {}
		header_dim = None
		with open(path) as f:
			for i, line in enumerate(f):
				# With a non-negative limit, lines 0..limit (inclusive) are read.
				if i > limit > -1:
					break

				stripped = line.strip()
				header = _parse_header(stripped)
				if header is not None:
					header_dim = header[1]
					continue

				word, sep, emb = stripped.partition(" ")
				if not sep:
					# Blank line, or a word with no vector: nothing to store.
					continue
				words[word] = emb
		return words, header_dim


	def combine(f1, f2, outf, limit=-1, dim=64):
		"""Merge two text embedding files into one, preferring entries from *f1*.

		Args:
			f1: Path of the first embedding file; its vectors win on duplicates.
			f2: Path of the second embedding file.
			outf: Path of the file to write.
			limit: If non-negative, only lines 0..limit of each input are read
				(the header line, when present, counts as a line).  ``-1``
				reads everything.
			dim: Dimension written into the output header.  It is NOT derived
				from the inputs; a warning is printed to stderr when an input
				header declares a different dimension.

		The number of words read from each input is printed to stdout.  The
		output starts with a ``"<count> <dim>"`` header followed by one
		``"<word> <vector>"`` line per word.
		"""
		words1, dim1 = _read_embeddings(f1, limit)
		words2, dim2 = _read_embeddings(f2, limit)

		print(len(words1))
		print(len(words2))

		# The header keeps using `dim`, but a disagreement with the input
		# headers is worth flagging.
		for path, found in ((f1, dim1), (f2, dim2)):
			if found is not None and found != dim:
				print(
					"warning: {} declares dimension {}, but the output header "
					"will say {}".format(path, found, dim),
					file=sys.stderr,
				)

		# combine these two, prefer vocab in words1; the ** unpacking is
		# required: {words2, words1} would try to build a set of dicts.
		comb = {**words2, **words1}

		with open(outf, "w") as out:
			# write new header first
			out.write("{} {}\n".format(len(comb), dim))
			for w, emb in comb.items():
				out.write("{} {}\n".format(w, emb))

	if __name__ == "__main__":
		# Positional arguments: first embedding file, second embedding file,
		# output path.
		f1 = sys.argv[1]
		f2 = sys.argv[2]
		outf = sys.argv[3]
		# The line limit is taken from the fourth argument only when exactly
		# four arguments were supplied; -1 makes combine() read every line.
		if len(sys.argv) == 5:
			limit = int(sys.argv[4])
		else:
			limit = -1
		combine(f1, f2, outf, limit)