Skip to content

Instantly share code, notes, and snippets.

View cj2001's full-sized avatar
💭
Working remote

C.J. Sullivan cj2001

💭
Working remote
View GitHub Profile
@cj2001
cj2001 / grapher.py
Created May 9, 2019 22:38
Dask for Michal
import json
import argparse
import yaml
from typing import Any, Dict
import networkx as nx
import pandas as pd
from tqdm import tqdm
import pygraphviz
import graphviz
@cj2001
cj2001 / load_hero_edges.cypher
Created October 30, 2020 18:11
Loading heroes edge list CSV data into Neo4j
// For each row of heros_edge_list.csv, look up the existing hero and comic
// nodes named by the row and create a relationship between them.
// NOTE(review): APOC documents apoc.create.relationship as
// (from, relType, props, to); this call passes only the two nodes, so the
// relationship type and property map appear to be missing -- confirm the
// intended type before running.
LOAD CSV WITH HEADERS FROM "file:///heros_edge_list.csv" AS row
MATCH (h1:hero {hero: row.hero})
MATCH (c1:comic {comic: row.comic})
CALL apoc.create.relationship(h1, c1) YIELD rel
// NOTE(review): presumably rel.noOp never exists and this REMOVE only serves
// to end the query with an updating clause after CALL ... YIELD -- confirm.
REMOVE rel.noOp;
@cj2001
cj2001 / neo4j_start_docker_container.sh
Last active January 26, 2021 17:56
Starter Docker container for Medium post
# Start a neo4j:latest container, publishing ports 7474 and 7687 and
# bind-mounting host directories under $HOME for data, import files,
# plugins and logs.
# Shell variable names are case-sensitive: the logs mount must use $HOME
# (an unset $home would expand to nothing and mount /neo4j/logs instead).
# Volume arguments are quoted so a home directory containing spaces still works.
docker run -p 7474:7474 -p 7687:7687 \
    --volume="$HOME/graph_data/data:/data" \
    --volume="$HOME/graph_data/gameofthrones/data:/var/lib/neo4j/import" \
    --volume="$HOME/graph_data/plugins:/var/lib/neo4j/plugins" \
    --volume="$HOME/neo4j/logs:/var/lib/neo4j/logs" \
    --env NEO4J_dbms_security_procedures_unrestricted=gds.\\\*,apoc.\\\* \
    --env apoc.import.file.enabled=true \
    --env NEO4J_AUTH=neo4j/1234 \
    neo4j:latest
// Upsert one Character node per row of the node CSV, keyed on the Id column,
// and set its name from the Label column.
LOAD CSV WITH HEADERS FROM "file:///got-s1-nodes.csv" AS record
MERGE (c:Character {id: record.Id})
SET c.name = record.Label
@cj2001
cj2001 / load_arxiv_data.py
Last active February 9, 2021 23:31
Load arXiv data
from itertools import islice

file = "./arxiv-metadata-oai-snapshot.json"
lines = 100000  # 100k for testing

# Each line of the snapshot is parsed as its own JSON document. islice stops
# after `lines` records so the test limit above is actually enforced instead
# of reading the entire file into memory.
with open(file, 'r', encoding='utf-8') as f:
    metadata = [json.loads(line) for line in tqdm(islice(f, lines))]
@cj2001
cj2001 / clean_author_and_category_lists.py
Created February 9, 2021 23:36
Clean arXiv author and category lists
def get_author_list(line):
    """Build one display name per parsed-author entry of a dataframe row.

    Args:
        line: Iterable of author entries. For each entry ``e``, ``e[1]`` is
            placed first and ``e[0]`` second (presumably
            ``[last_name, first_name, ...]`` -- confirm against the dataset).

    Returns:
        A list of name strings, one per entry, with no leading or trailing
        whitespace.
    """
    # Strip so an entry whose e[1] is empty (e.g. a collaboration listed as
    # a single name) does not come out with a leading space.
    return [f"{e[1]} {e[0]}".strip() for e in line]
def get_category_list(line):
    """Split a row's whitespace-separated category string into a list.

    Args:
        line: String of category codes separated by whitespace.

    Returns:
        A list of the category codes. Runs of whitespace and leading or
        trailing whitespace never produce empty-string entries; an empty
        input yields an empty list.
    """
    # split() with no argument collapses repeated whitespace, unlike
    # split(" "), which would emit '' for every extra space.
    return line.split()
@cj2001
cj2001 / neo4j_python_connection_class.py
Created February 9, 2021 23:43
Neo4j Python connection class
class Neo4jConnection:
def __init__(self, uri, user, pwd):
self.__uri = uri
self.__user = user
self.__pwd = pwd
self.__driver = None
try:
self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
except Exception as e:
@cj2001
cj2001 / create_arxiv_constraints.py
Created February 9, 2021 23:44
Create arXiv constraints
# Uniqueness constraints for the Paper, Author and Category node keys, each
# created only if it does not already exist.
# NOTE(review): ON ... ASSERT is the Neo4j 4.x constraint syntax; newer
# servers expect FOR ... REQUIRE -- confirm against the target version.
constraint_statements = (
    'CREATE CONSTRAINT papers IF NOT EXISTS ON (p:Paper) ASSERT p.id IS UNIQUE',
    'CREATE CONSTRAINT authors IF NOT EXISTS ON (a:Author) ASSERT a.name IS UNIQUE',
    'CREATE CONSTRAINT categories IF NOT EXISTS ON (c:Category) ASSERT c.category IS UNIQUE',
)
for statement in constraint_statements:
    conn.query(statement)
@cj2001
cj2001 / paper_nodes_and_edges.py
Created February 9, 2021 23:51
Add arXiv paper nodes and all edges
def add_papers(rows, batch_size=5000):
# Adds paper nodes and (:Author)--(:Paper) and
# (:Paper)--(:Category) relationships to the Neo4j graph as a
# batch job.
query = '''
UNWIND $rows as row
MERGE (p:Paper {id:row.id}) ON CREATE SET p.title = row.title
// connect categories
@cj2001
cj2001 / add_category_and_author_nodes.py
Created February 9, 2021 23:53
Add category and author nodes
# One row per distinct category: take the per-row category lists, rename the
# column, flatten each list into its own rows, then drop repeats.
categories = (
    pd.DataFrame(df[['category_list']])
    .rename(columns={'category_list': 'category'})
    .explode('category')
    .drop_duplicates(subset=['category'])
)

# Same shape of pipeline for authors, starting from the cleaned author lists.
authors = (
    pd.DataFrame(df[['cleaned_authors_list']])
    .rename(columns={'cleaned_authors_list': 'author'})
    .explode('author')
    .drop_duplicates(subset=['author'])
)