Skip to content

Instantly share code, notes, and snippets.

@tuxdna
Last active March 7, 2018 10:11
Show Gist options
  • Save tuxdna/4930d9f6c30f2cca8e2a77aeab77b8b7 to your computer and use it in GitHub Desktop.
Save tuxdna/4930d9f6c30f2cca8e2a77aeab77b8b7 to your computer and use it in GitHub Desktop.
Sync graph data

How to run?

Create config.py from config.py.template and fill in your AUTH token, and the target URL.

Then invoke the graph sync:

$ python3 perform_graph_sync.py package input-file.json maven

Invoking url: http://.../api/v1/graphsync/sync_all/maven/HTTPClient%3AHTTPClient
COMPLETED: maven/HTTPClient:HTTPClient
Invoking url: http://.../api/v1/graphsync/sync_all/maven/abbot%3Aabbot
COMPLETED: maven/abbot:abbot
Invoking url: http://.../api/v1/graphsync/sync_all/maven/abbot%3Acostello
COMPLETED: maven/abbot:costello
... MORE OUTPUT ...

The input-file.json file contains data in following format:

{
    "data": [
        {
            "ecosystem": "go", 
            "name": "github.com/01org/cc-oci-runtime/proxy/api", 
            "version": "c0d481bb7e72e27394c708df5ed970338cd9392f"
        }, 
        {
            "ecosystem": "go", 
            "name": "github.com/01org/ciao", 
            "version": "b4a2d33d7c30a4ecf0af1de4729c2125c1bfadaa"
        }, 



}

Just terminate and restart as required.

# Template for config.py: copy this file to config.py and fill in real values.
# Auth token sent in the 'auth-token' header on every graph-sync request.
AUTH_TOKEN = '***************************'
# Base URL of the graph-sync API. The sync script joins relative paths such as
# "pending/<eco>" and "sync_all/..." onto this, so keep the trailing '/'.
BASE_URL = 'http://bayesian-jobs-***.openshiftapps.com/api/v1/graphsync/'
#!/bin/bash
# Repeatedly fetch the pending-npm list and sync each batch until interrupted.
AUTH_TOKEN="d8d26aece35a4a892e28dfde89a88ecfd8d86893"
OUTPUT_FILE=pending_output-4.json
URL='http://bayesian-jobs-bayesian-production.09b5.dsaas.openshiftapps.com/api/v1/graphsync/pending/npm'
while true
do
    # Keep the previous batch around for inspection; skip the move on the
    # very first iteration, when no output file exists yet.
    [ -f "$OUTPUT_FILE" ] && mv "$OUTPUT_FILE" "/tmp/$OUTPUT_FILE"
    curl -o "$OUTPUT_FILE" -X GET --header 'Accept: application/problem+json' --header "auth-token: $AUTH_TOKEN" "$URL"
    # Reuse $OUTPUT_FILE instead of repeating the literal filename, so the
    # two can never drift apart.
    python3 perform_graph_sync.py package "$OUTPUT_FILE" npm 2>&1 | tee -a pass-4-output.log
    sleep 2
done
import requests
from urllib.parse import urlencode, urljoin
import config
import json
import concurrent.futures
import traceback
import sys
import shelve
import threading
# import asyncio
def all_ecosystems(ecosystems):
    """Fetch the pending-package list for each ecosystem and sync every entry.

    Uses the module-level ``base_url`` and ``headers``; delegates each
    package to :func:`do_sync`.
    """
    for eco in ecosystems:
        pending_url = urljoin(base_url, "pending/" + eco)
        print("Invoking url: %s" % pending_url)
        resp = requests.get(pending_url, headers=headers)
        resp.raise_for_status()
        pending = resp.json()["data"]
        print("Pending %s packages = %s" % (eco, len(pending)))
        # Nothing pending for this ecosystem — move on.
        if not pending:
            continue
        # invoke graph_sync for this ecosystem
        for pkg in pending:
            do_sync(pkg)
# Common request headers: every graph-sync endpoint authenticates via the
# 'auth-token' header taken from config.py.
headers = {'Accept': 'application/json', 'auth-token': config.AUTH_TOKEN}
base_url = config.BASE_URL
# Shelve database used as a persistent "already synced" set, so a restarted
# run skips packages that completed earlier. The lock guards it because
# multiple worker threads write and sync() it concurrently.
db_lock = threading.Lock()
db = shelve.open("package_shelve.db")
def my_url_encode(s):
    """URL-encode *s* as a single component (e.g. ':' becomes '%3A')."""
    # urlencode({"": s}) produces "=<encoded s>"; strip the leading "=".
    encoded = urlencode({"": s})
    return encoded[1:]
def do_sync_version(package):
    """Trigger a graph sync for one (ecosystem, name, version) triple.

    Example:
    package = {'ecosystem': 'maven', 'name': 'junit:junit', 'version': '3.8'}
    """
    parts = [my_url_encode(package[k]) for k in ("ecosystem", "name", "version")]
    url = urljoin(base_url, "sync_all/%s/%s/%s" % tuple(parts))
    print("Invoking url: %s" % url)
    resp = requests.get(url, headers=headers)
    try:
        resp.raise_for_status()
    except Exception:
        # Log the failure but keep the worker alive for the next package.
        traceback.print_exc()
    else:
        key = "%s/%s/%s" % (package["ecosystem"], package["name"], package["version"])
        print("COMPLETED: %s" % key)
        # Record success so a restarted run skips this entry.
        with db_lock:
            db[key] = 1
            db.sync()
def do_sync(package):
    """Trigger a package-level graph sync for one (ecosystem, name) pair.

    Example:
    package = {'ecosystem': 'maven', 'name': 'junit:junit', 'version': '3.8'}
    """
    ecosystem = my_url_encode(package["ecosystem"])
    pname = my_url_encode(package["name"])
    # CONSISTENCY FIX: build the URL with urljoin, exactly as do_sync_version
    # does. This is identical to the previous string concatenation as long as
    # config.BASE_URL ends with '/' (as the template shows it must).
    url = urljoin(base_url, "sync_all/%s/%s" % (ecosystem, pname))
    print("Invoking url: %s" % url)
    response = requests.get(url, headers=headers)
    try:
        response.raise_for_status()
    except Exception:
        # Log and continue; other packages should still be attempted.
        traceback.print_exc()
    else:
        key = "%s/%s" % (package["ecosystem"], package["name"])
        print("COMPLETED: %s" % key)
        # Mark as done so a restarted run skips this package.
        with db_lock:
            db[key] = 1
            db.sync()
# ---- command-line driven entry point ----
# Usage: python3 perform_graph_sync.py [package|version] [input.json] [eco ...]
granularity = "package"
if len(sys.argv) >= 2:
    granularity = sys.argv[1]
input_file = "maven-data-new.json"
if len(sys.argv) >= 3:
    input_file = sys.argv[2]
ecosystems = set(["nuget", "maven", "npm", "pypi", "go"])
if len(sys.argv) >= 4:
    ecosystems = set(sys.argv[3:])
# FIX: close the input file promptly instead of leaking the handle.
with open(input_file, "r") as fp:
    data = json.load(fp)
packages = data["data"]
# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Start the load operations and map each future to its package
    future_to_package_map = {}
    invoked_list = set()
    if granularity == "version":
        for package in packages:
            p, v = package["name"], package["version"]
            key = "%s/%s/%s" % (package["ecosystem"], package["name"], package["version"])
            # Skip ecosystems we were not asked for and entries already synced.
            if package["ecosystem"] in ecosystems and key not in db:
                f = executor.submit(do_sync_version, package)
                invoked_list.add((p, v))
                future_to_package_map[f] = package
    elif granularity == "package":
        for package in packages:
            p = package["name"]
            key = "%s/%s" % (package["ecosystem"], package["name"])
            if package["ecosystem"] in ecosystems and key not in db:
                f = executor.submit(do_sync, package)
                invoked_list.add(p)
                future_to_package_map[f] = package
    for future in concurrent.futures.as_completed(future_to_package_map):
        pkg = future_to_package_map[future]
        try:
            # FIX: bind to a fresh name instead of clobbering the outer 'data'.
            result = future.result()
            # do json processing here
        except Exception as exc:
            # BUG FIX: report the package belonging to THIS future ('pkg').
            # The old code printed 'package', the stale submit-loop variable,
            # which always named the last package in the input file.
            print("FAILURE: %s" % pkg)
            print('%r generated an exception: %s' % (pkg, exc))
        else:
            print("SUCCESS: %s" % pkg)
# One-off loader: seed the "already synced" shelve from a saved log so that
# perform_graph_sync.py skips everything that completed in earlier passes.
import shelve

db = shelve.open("package_shelve.db")
try:
    # FIX: iterate the file lazily and close it deterministically.
    with open("sorted.list") as log:
        for line in log:
            if "COMPLETED: " in line:
                # BUG FIX: take the text AFTER the marker instead of the fixed
                # slice line[11:], which is wrong whenever the marker is not
                # at column 0 of the logged line.
                key = line.split("COMPLETED: ", 1)[1].strip()
                # if " " not in key: # skip entries with spaces
                db[key] = 1
    print("Total entries: ", len(db))
finally:
    db.sync()
    db.close()
#!/bin/bash
# Repeatedly fetch the pending-maven list and sync each batch until interrupted.
AUTH_TOKEN="d8d26aece35a4a892e28dfde89a88ecfd8d86893"
OUTPUT_FILE=pending_output-4.json
URL='http://bayesian-jobs-bayesian-production.09b5.dsaas.openshiftapps.com/api/v1/graphsync/pending/maven'
while true
do
    # Keep the previous batch around for inspection; skip the move on the
    # very first iteration, when no output file exists yet.
    [ -f "$OUTPUT_FILE" ] && mv "$OUTPUT_FILE" "/tmp/$OUTPUT_FILE"
    curl -o "$OUTPUT_FILE" -X GET --header 'Accept: application/problem+json' --header "auth-token: $AUTH_TOKEN" "$URL"
    # Reuse $OUTPUT_FILE instead of repeating the literal filename, so the
    # two can never drift apart.
    python3 perform_graph_sync.py package "$OUTPUT_FILE" maven 2>&1 | tee -a pass-4-output.log
    sleep 2
done
@tuxdna
Copy link
Author

tuxdna commented Feb 14, 2018

Following error is encountered multiple times:


Traceback (most recent call last):
  File "perform_graph_sync.py", line 126, in <module>
    for future in concurrent.futures.as_completed(future_to_package_map):
  File "/usr/lib64/python3.4/concurrent/futures/_base.py", line 195, in as_completed
    with _AcquireFutures(fs):
  File "/usr/lib64/python3.4/concurrent/futures/_base.py", line 146, in __enter__
    future._condition.acquire()
AttributeError: '_io.TextIOWrapper' object has no attribute '_condition'

@tuxdna
Copy link
Author

tuxdna commented Feb 14, 2018

Remaining packages

In [9]: remainpackages = set([ x["name"] for x in data["data"]])

In [10]: len(remainpackages)
Out[10]: 66447

In [11]: len(remainpackages) / 131460.0
Out[11]: 0.5054541305340028

In [12]: 100 * len(remainpackages) / 131460.0
Out[12]: 50.545413053400274

Remaining package versions:


In [1]: import json

In [2]: data = json.load(open("maven-data-new.json"))

In [3]: only_maven = [x for x in data["data"] if x["ecosystem"] == "maven"]

In [4]: len(only_maven)
Out[4]: 435837

In [5]: 226752 / 435837 
Out[5]: 0.5202678983197847


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment