Yue Zhang yjzhang

## k_means.py
# Online k-means algorithm
# see http://www.cs.princeton.edu/courses/archive/fall08/cos436/Duda/C/sk_means.htm

import numpy as np

def k_means(data, k, threshhold=2):
    """
    Does k-means clustering of the data.

    Args:

## download-medline-data.sh
#!/bin/bash

# downloads all MEDLINE/Pubmed citations in the annual baseline.

for i in $(seq 1 972); do
    fname="1"
    if ((i < 10)); then
        fname="ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed19n000$i.xml.gz"
    elif ((i < 100)); then
        fname="ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed19n00$i.xml.gz"

## neo4j_cypher_cheatsheet.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                yjzhang
                / neo4j_cypher_cheatsheet.md
            
            
              Created
              October 19, 2022 01:23
                — forked from DaniSancas/neo4j_cypher_cheatsheet.md
            
              
                Neo4j's Cypher queries cheatsheet
              
          
    Neo4j Tutorial

Fundamentals

Store any kind of data using the following graph concepts:

Node: Graph data records
Relationship: Connects nodes (has a direction and a type)
Property: Stores data as key-value pairs on nodes and relationships
Label: Groups nodes and relationships (optional)


## pubmed_abstracts_downloader.py
# https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/
import requests
from selectolax.parser import HTMLParser

base_url = 'https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/'

r = requests.get(base_url)
tree = HTMLParser(r.content)

for node in tree.css('a'):

## node2vec.py
import functools

#from numba import jit
import numpy as np
from scipy import sparse


def random_walks(adj_list, r, l, p=1, q=1, verbose=False):
    """
    Biased random walk starting from node i.

## find_all_shortest_paths.py
def find_all_shortest_paths(dic_node, n1, n2, step_threshold):
    return_paths = []
    visit_queue = [[n1]]
    # this can be updated to get all shortest paths for all visited nodes
    # for the shortest path from N1 to N2, all intermediate paths are also shortest paths between their respective nodes.
    visited_nodes_prev = set()
    visited_nodes = set()
    cur_distance = 0
    while len(visit_queue):
        cur_path = visit_queue.pop(0)

## pagerank_sparse.py
# Using a sparse matrix imported from...

import numpy as np
from scipy import sparse, io


def pagerank(adjacency, probs=None, n_iters=20, resid=0.85, modify_matrix=True):
    """
    Args:
        adjacency - sparse matrix

## filter.py
import subprocess

import numpy as np
from scipy import sparse, io

threshold = 1000
folders = ['M7_5', 'M8_3', 'X5.3.4', 'X5_2', 'X6.1']

for f in folders:
    print(f)

## readability.py
#!/usr/bin/env python3

import sys
import textstat

path = 'scripted_test_outputs/000_output.txt'

if len(sys.argv) > 1:
    path = sys.argv[1]

## sort_submissions.py
import os
import subprocess

filenames = os.listdir('.')
for filename in filenames:
    name = filename.split('_')[0]
    if not os.path.exists(name):
        os.makedirs(name)
    os.rename(filename, os.path.join(name, filename))
    if filename.endswith('zip'):
	# Online k-means algorithm
	# see http://www.cs.princeton.edu/courses/archive/fall08/cos436/Duda/C/sk_means.htm

	import numpy as np

	def k_means(data, k, threshhold=2):
	"""
	Does k-means clustering of the data.

	Args:
	#!/bin/bash

	# downloads all MEDLINE/Pubmed citations in the annual baseline.

	for i in $(seq 1 972); do
	fname="1"
	if ((i < 10)); then
	fname="ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed19n000$i.xml.gz"
	elif ((i < 100)); then
	fname="ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed19n00$i.xml.gz"
	# https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/
	import requests
	from selectolax.parser import HTMLParser

	base_url = 'https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/'

	r = requests.get(base_url)
	tree = HTMLParser(r.content)

	for node in tree.css('a'):
	import functools

	#from numba import jit
	import numpy as np
	from scipy import sparse


	def random_walks(adj_list, r, l, p=1, q=1, verbose=False):
	"""
	Biased random walk starting from node i.
	def find_all_shortest_paths(dic_node, n1, n2, step_threshold):
	return_paths = []
	visit_queue = [[n1]]
	# this can be updated to get all shortest paths for all visited nodes
	# for the shortest path from N1 to N2, all intermediate paths are also shortest paths between their respective nodes.
	visited_nodes_prev = set()
	visited_nodes = set()
	cur_distance = 0
	while len(visit_queue):
	cur_path = visit_queue.pop(0)
	# Using a sparse matrix imported from...

	import numpy as np
	from scipy import sparse, io


	def pagerank(adjacency, probs=None, n_iters=20, resid=0.85, modify_matrix=True):
	"""
	Args:
	adjacency - sparse matrix
	import subprocess

	import numpy as np
	from scipy import sparse, io

	threshold = 1000
	folders = ['M7_5', 'M8_3', 'X5.3.4', 'X5_2', 'X6.1']

	for f in folders:
	print(f)
	#!/usr/bin/env python3

	import sys
	import textstat

	path = 'scripted_test_outputs/000_output.txt'

	if len(sys.argv) > 1:
	path = sys.argv[1]
	import os
	import subprocess

	filenames = os.listdir('.')
	for filename in filenames:
	name = filename.split('_')[0]
	if not os.path.exists(name):
	os.makedirs(name)
	os.rename(filename, os.path.join(name, filename))
	if filename.endswith('zip'):