Skip to content

Instantly share code, notes, and snippets.

@callahantiff
Last active May 13, 2021 17:52
Show Gist options
  • Save callahantiff/9c42f14699a45d2e43cac154947fcd84 to your computer and use it in GitHub Desktop.
Save callahantiff/9c42f14699a45d2e43cac154947fcd84 to your computer and use it in GitHub Desktop.
Sparse Node2Vec Wrapper for Large Networks
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# script created using: https://github.com/VHRanger/nodevectors
# import needed libraries
import argparse
import csrgraph as cg
import nodevectors
def main():
parser = argparse.ArgumentParser(description='A wrapper for running Node2Vec on Very Large Graphs')
parser.add_argument('-e', '--edgelist', help='Name/path to text file containing graph edge list', required=True)
parser.add_argument('-d', '--dim', help='Embedding dimensions', required=True)
parser.add_argument('-l', '--walklen', help='Random walk length', required=True)
parser.add_argument('-r', '--walknum', help='Number of walks', required=True)
parser.add_argument('-t', '--threads', help='# threads to use', default=0)
parser.add_argument('-p', '--return_weight', help='Return node probability', default=1.)
parser.add_argument('-q', '--explore_weight', help='Node visit probability', default=1.)
parser.add_argument('-k', '--window', help='Context window size', required=True)
parser.add_argument('-w', '--keep_walks', help='Save the random walks', default=False)
parser.add_argument('-m', '--save_model', help='Save Gensim node2vec model', default=False)
args = parser.parse_args()
# print user parameters to console
print('\n#######################################################################\n')
print('NODE2VEC Parameters:')
print('Edge List: {input_file}'.format(input_file=args.edgelist.split('/')[-1]))
print('Embedding Dimensions: {dim}'.format(dim=args.dim))
print('Random walk Length: {walk_len}'.format(walk_len=args.walklen))
print('Number of random walks: {walk_num}'.format(walk_num=args.walknum))
print('Threads: {threads}'.format(threads=args.threads))
print('Return Weight (p): {p}'.format(p=args.return_weight))
print('Explore Weight (q): {q}'.format(q=args.explore_weight))
print('Context Window Size: {window_size}'.format(window_size=args.window))
print('Save Random Walks with Node2Vec Model: {keep_walks}'.format(keep_walks=args.keep_walks))
print('Save Gensim Node2Vec Model: {save_model}'.format(save_model=args.save_model))
print('Embedding output: {write_loc}'.format(write_loc=args.edgelist.split('.')[0] + '_node2vec_Embeddings.emb'))
print('\n#######################################################################\n')
print('\n#### STEP 1: Convert Edge List to CSR Graph ####')
graph = cg.read_edgelist(args.edgelist, sep=' ', header=None)
print('\n#### STEP 2: Fit Embedding Model to Graph ####')
g2v = nodevectors.Node2Vec(n_components=int(args.dim),
walklen=int(args.walklen),
epochs=int(args.walknum),
return_weight=float(args.return_weight),
neighbor_weight=float(args.explore_weight),
threads=int(args.threads),
keep_walks=args.keep_walks,
verbose=True,
w2vparams={'window': int(args.window), 'iter': 10})
g2v.fit(graph)
print('\n#### STEP 3: Save Model Output and Embeddings ####')
# save embeddings (gensim.KeyedVector format)
g2v.save_vectors(args.edgelist.split('.')[0] + '_node2vec_Embeddings.emb')
if args.save_model:
# save node2vec model -- uses a lot of memory and takes a very long time to run on large graphs
g2v.save(args.edgelist.split('.')[0] + '_node2vec_Model.pkl')
# g2v = Node2vec.load(args.input.split('.')[0] + '_node2vec_Model.pkl')
if __name__ == '__main__':
main()
@callahantiff
Copy link
Author

callahantiff commented Dec 2, 2020

Complete Before Running node2vec

Install needed libraries

user:~$ pip install argparse csrgraph nodevectors tqdm

STEP 1

Map Identifiers-based subject, predicate, and object entities to integers.

import json
from tqdm import tqdm

# read in original data file 
file_loc = '<<your filepath here>>'
with open(file_loc, 'r') as f_in:
    kg_data = set(tuple(x.split('\t')) for x in f_in.read().splitlines())
f_in.close()

# set output filenames
output_ints_location = '<<your filepath here>>'
output_ints_map_location = '<<your filepath here>>'

# map identifiers to integers
entity_map = {}
entity_counter = 0
graph_len = len(kg_data)

ints = open(output_ints_location, 'w', encoding='utf-8')
ints.write('subject' + '\t' + 'predicate' + '\t' + 'object' + '\n')

for s, p, o in tqdm(kg_data):
    subj, pred, obj = s, p, o
    if subj not in entity_map: entity_counter += 1; entity_map[subj] = entity_counter
    if pred not in entity_map: entity_counter += 1; entity_map[pred] = entity_counter
    if obj not in entity_map: entity_counter += 1; entity_map[obj] = entity_counter
    ints.write('%d' % entity_map[subj] + '\t' + '%d' % entity_map[pred] + '\t' + '%d' % entity_map[obj] + '\n')
ints.close()

# write out the identifier-integer map
with open(output_ints_map_location, 'w') as file_name:
    json.dump(entity_map, file_name)

STEP 2

Node2vec requires the graph to be formatted in a specific way (nodea nodeb) rather than nodea edge nodeb like most programs output. The code chunk below reads in a text file containing edges lists formatted as triples and outputs a modified edge list with the edges removed (i.e. nodea nodeb).

# read original data file and convert 3 columns to 2 (just subjects and objects)
inputs_ints_file_loc = '<<your filepath here>>'

with open(inputs_ints_file_loc) as f_in:
    kg_data = [x.split('\t')[0::2] for x in f_in.read().splitlines()]

f_in.close()

# write out cleaned file
file_out = '<<>> '

with open(file_out, 'w') as f_out:
    for x in kg_data[1:]:
        f_out.write(x[0] + ' ' + x[1] + '\n')

f_out.close()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment