Skip to content

Instantly share code, notes, and snippets.

@yjzhang
yjzhang / k_means.py
Last active February 9, 2024 21:04
Basic Python implementation of k-means and online k-means clustering
# Online k-means algorithm
# see http://www.cs.princeton.edu/courses/archive/fall08/cos436/Duda/C/sk_means.htm
import numpy as np
def k_means(data, k, threshhold=2):
"""
Does k-means clustering of the data.
Args:
@yjzhang
yjzhang / download-medline-data.sh
Created February 1, 2019 23:01
Download all MEDLINE citations from NCBI
#!/bin/bash
# downloads all MEDLINE/Pubmed citations in the annual baseline.
for i in $(seq 1 972); do
fname="1"
if ((i < 10)); then
fname="ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed19n000$i.xml.gz"
elif ((i < 100)); then
fname="ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed19n00$i.xml.gz"
@yjzhang
yjzhang / neo4j_cypher_cheatsheet.md
Created October 19, 2022 01:23 — forked from DaniSancas/neo4j_cypher_cheatsheet.md
Neo4j's Cypher queries cheatsheet

Neo4j Tutorial

Fundamentals

Store any kind of data using the following graph concepts:

  • Node: Graph data records
  • Relationship: Connects nodes (has a direction and a type)
  • Property: Stores data as key-value pairs on nodes and relationships
  • Label: Groups nodes and relationships (optional)
# https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/
import requests
from selectolax.parser import HTMLParser
base_url = 'https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/'
r = requests.get(base_url)
tree = HTMLParser(r.content)
for node in tree.css('a'):
@yjzhang
yjzhang / node2vec.py
Created September 29, 2022 21:26
very basic node2vec implementation
import functools
#from numba import jit
import numpy as np
from scipy import sparse
def random_walks(adj_list, r, l, p=1, q=1, verbose=False):
"""
Biased random walk starting from node i.
@yjzhang
yjzhang / find_all_shortest_paths.py
Created September 29, 2022 21:23
Find all shortest paths between two nodes in a graph using BFS.
def find_all_shortest_paths(dic_node, n1, n2, step_threshold):
return_paths = []
visit_queue = [[n1]]
# this can be updated to get all shortest paths for all visited nodes
# for the shortest path from N1 to N2, all intermediate paths are also shortest paths between their respective nodes.
visited_nodes_prev = set()
visited_nodes = set()
cur_distance = 0
while len(visit_queue):
cur_path = visit_queue.pop(0)
@yjzhang
yjzhang / pagerank_sparse.py
Created September 15, 2022 02:02
Implementation of PageRank in Python using sparse matrices
# Using a sparse matrix imported from...
import numpy as np
from scipy import sparse, io
def pagerank(adjacency, probs=None, n_iters=20, resid=0.85, modify_matrix=True):
"""
Args:
adjacency - sparse matrix
import subprocess
import numpy as np
from scipy import sparse, io
threshold = 1000
folders = ['M7_5', 'M8_3', 'X5.3.4', 'X5_2', 'X6.1']
for f in folders:
print(f)
#!/usr/bin/env python3
import sys
import textstat
path = 'scripted_test_outputs/000_output.txt'
if len(sys.argv) > 1:
path = sys.argv[1]
import os
import subprocess
filenames = os.listdir('.')
for filename in filenames:
name = filename.split('_')[0]
if not os.path.exists(name):
os.makedirs(name)
os.rename(filename, os.path.join(name, filename))
if filename.endswith('zip'):