Skip to content

Instantly share code, notes, and snippets.

@tuxdna
Last active March 7, 2018 10:11
Show Gist options
  • Save tuxdna/4930d9f6c30f2cca8e2a77aeab77b8b7 to your computer and use it in GitHub Desktop.
Save tuxdna/4930d9f6c30f2cca8e2a77aeab77b8b7 to your computer and use it in GitHub Desktop.
Sync graph data

How to run?

Create config.py from config.py.template and fill in your AUTH token, and the target URL.

Then invoke the graph sync:

$ python3 perform_graph_sync.py package input-file.json maven

Invoking url: http://.../api/v1/graphsync/sync_all/maven/HTTPClient%3AHTTPClient
COMPLETED: maven/HTTPClient:HTTPClient
Invoking url: http://.../api/v1/graphsync/sync_all/maven/abbot%3Aabbot
COMPLETED: maven/abbot:abbot
Invoking url: http://.../api/v1/graphsync/sync_all/maven/abbot%3Acostello
COMPLETED: maven/abbot:costello
... MORE OUTPUT ...

The input-file.json file contains data in following format:

{
    "data": [
        {
            "ecosystem": "go", 
            "name": "github.com/01org/cc-oci-runtime/proxy/api", 
            "version": "c0d481bb7e72e27394c708df5ed970338cd9392f"
        }, 
        {
            "ecosystem": "go", 
            "name": "github.com/01org/ciao", 
            "version": "b4a2d33d7c30a4ecf0af1de4729c2125c1bfadaa"
        }, 



}

Just terminate and restart as required.

# Template for config.py: copy this file to config.py and fill in real values.
# Auth token sent in the 'auth-token' header on every graph-sync request.
AUTH_TOKEN = '***************************'
# Base URL of the graph-sync API. The sync script joins relative paths such as
# "pending/<eco>" and "sync_all/..." onto this, so keep the trailing '/'.
BASE_URL = 'http://bayesian-jobs-***.openshiftapps.com/api/v1/graphsync/'
#!/bin/bash
# Repeatedly fetch the pending-npm list and sync each batch until interrupted.
AUTH_TOKEN="d8d26aece35a4a892e28dfde89a88ecfd8d86893"
OUTPUT_FILE=pending_output-4.json
URL='http://bayesian-jobs-bayesian-production.09b5.dsaas.openshiftapps.com/api/v1/graphsync/pending/npm'
while true
do
    # Keep the previous batch around for inspection; skip the move on the
    # very first iteration, when no output file exists yet.
    [ -f "$OUTPUT_FILE" ] && mv "$OUTPUT_FILE" "/tmp/$OUTPUT_FILE"
    curl -o "$OUTPUT_FILE" -X GET --header 'Accept: application/problem+json' --header "auth-token: $AUTH_TOKEN" "$URL"
    # Reuse $OUTPUT_FILE instead of repeating the literal filename, so the
    # two can never drift apart.
    python3 perform_graph_sync.py package "$OUTPUT_FILE" npm 2>&1 | tee -a pass-4-output.log
    sleep 2
done
import requests
from urllib.parse import urlencode, urljoin
import config
import json
import concurrent.futures
import traceback
import sys
import shelve
import threading
# import asyncio
def all_ecosystems(ecosystems):
    """Fetch the pending-package list for each ecosystem and sync every entry.

    Uses the module-level ``base_url`` and ``headers``; delegates each
    package to :func:`do_sync`.
    """
    for eco in ecosystems:
        pending_url = urljoin(base_url, "pending/" + eco)
        print("Invoking url: %s" % pending_url)
        resp = requests.get(pending_url, headers=headers)
        resp.raise_for_status()
        pending = resp.json()["data"]
        print("Pending %s packages = %s" % (eco, len(pending)))
        # Nothing pending for this ecosystem — move on.
        if not pending:
            continue
        # invoke graph_sync for this ecosystem
        for pkg in pending:
            do_sync(pkg)
# Common request headers: every graph-sync endpoint authenticates via the
# 'auth-token' header taken from config.py.
headers = {'Accept': 'application/json', 'auth-token': config.AUTH_TOKEN}
base_url = config.BASE_URL
# Shelve database used as a persistent "already synced" set, so a restarted
# run skips packages that completed earlier. The lock guards it because
# multiple worker threads write and sync() it concurrently.
db_lock = threading.Lock()
db = shelve.open("package_shelve.db")
def my_url_encode(s):
    """URL-encode *s* as a single component (e.g. ':' becomes '%3A')."""
    # urlencode({"": s}) produces "=<encoded s>"; strip the leading "=".
    encoded = urlencode({"": s})
    return encoded[1:]
def do_sync_version(package):
    """Trigger a graph sync for one (ecosystem, name, version) triple.

    Example:
    package = {'ecosystem': 'maven', 'name': 'junit:junit', 'version': '3.8'}
    """
    parts = [my_url_encode(package[k]) for k in ("ecosystem", "name", "version")]
    url = urljoin(base_url, "sync_all/%s/%s/%s" % tuple(parts))
    print("Invoking url: %s" % url)
    resp = requests.get(url, headers=headers)
    try:
        resp.raise_for_status()
    except Exception:
        # Log the failure but keep the worker alive for the next package.
        traceback.print_exc()
    else:
        key = "%s/%s/%s" % (package["ecosystem"], package["name"], package["version"])
        print("COMPLETED: %s" % key)
        # Record success so a restarted run skips this entry.
        with db_lock:
            db[key] = 1
            db.sync()
def do_sync(package):
    """Trigger a package-level graph sync for one (ecosystem, name) pair.

    Example:
    package = {'ecosystem': 'maven', 'name': 'junit:junit', 'version': '3.8'}
    """
    ecosystem = my_url_encode(package["ecosystem"])
    pname = my_url_encode(package["name"])
    # CONSISTENCY FIX: build the URL with urljoin, exactly as do_sync_version
    # does. This is identical to the previous string concatenation as long as
    # config.BASE_URL ends with '/' (as the template shows it must).
    url = urljoin(base_url, "sync_all/%s/%s" % (ecosystem, pname))
    print("Invoking url: %s" % url)
    response = requests.get(url, headers=headers)
    try:
        response.raise_for_status()
    except Exception:
        # Log and continue; other packages should still be attempted.
        traceback.print_exc()
    else:
        key = "%s/%s" % (package["ecosystem"], package["name"])
        print("COMPLETED: %s" % key)
        # Mark as done so a restarted run skips this package.
        with db_lock:
            db[key] = 1
            db.sync()
# ---- command-line driven entry point ----
# Usage: python3 perform_graph_sync.py [package|version] [input.json] [eco ...]
granularity = "package"
if len(sys.argv) >= 2:
    granularity = sys.argv[1]
input_file = "maven-data-new.json"
if len(sys.argv) >= 3:
    input_file = sys.argv[2]
ecosystems = set(["nuget", "maven", "npm", "pypi", "go"])
if len(sys.argv) >= 4:
    ecosystems = set(sys.argv[3:])
# FIX: close the input file promptly instead of leaking the handle.
with open(input_file, "r") as fp:
    data = json.load(fp)
packages = data["data"]
# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Start the load operations and map each future to its package
    future_to_package_map = {}
    invoked_list = set()
    if granularity == "version":
        for package in packages:
            p, v = package["name"], package["version"]
            key = "%s/%s/%s" % (package["ecosystem"], package["name"], package["version"])
            # Skip ecosystems we were not asked for and entries already synced.
            if package["ecosystem"] in ecosystems and key not in db:
                f = executor.submit(do_sync_version, package)
                invoked_list.add((p, v))
                future_to_package_map[f] = package
    elif granularity == "package":
        for package in packages:
            p = package["name"]
            key = "%s/%s" % (package["ecosystem"], package["name"])
            if package["ecosystem"] in ecosystems and key not in db:
                f = executor.submit(do_sync, package)
                invoked_list.add(p)
                future_to_package_map[f] = package
    for future in concurrent.futures.as_completed(future_to_package_map):
        pkg = future_to_package_map[future]
        try:
            # FIX: bind to a fresh name instead of clobbering the outer 'data'.
            result = future.result()
            # do json processing here
        except Exception as exc:
            # BUG FIX: report the package belonging to THIS future ('pkg').
            # The old code printed 'package', the stale submit-loop variable,
            # which always named the last package in the input file.
            print("FAILURE: %s" % pkg)
            print('%r generated an exception: %s' % (pkg, exc))
        else:
            print("SUCCESS: %s" % pkg)
# One-off loader: seed the "already synced" shelve from a saved log so that
# perform_graph_sync.py skips everything that completed in earlier passes.
import shelve

db = shelve.open("package_shelve.db")
try:
    # FIX: iterate the file lazily and close it deterministically.
    with open("sorted.list") as log:
        for line in log:
            if "COMPLETED: " in line:
                # BUG FIX: take the text AFTER the marker instead of the fixed
                # slice line[11:], which is wrong whenever the marker is not
                # at column 0 of the logged line.
                key = line.split("COMPLETED: ", 1)[1].strip()
                # if " " not in key: # skip entries with spaces
                db[key] = 1
    print("Total entries: ", len(db))
finally:
    db.sync()
    db.close()
#!/bin/bash
# Repeatedly fetch the pending-maven list and sync each batch until interrupted.
AUTH_TOKEN="d8d26aece35a4a892e28dfde89a88ecfd8d86893"
OUTPUT_FILE=pending_output-4.json
URL='http://bayesian-jobs-bayesian-production.09b5.dsaas.openshiftapps.com/api/v1/graphsync/pending/maven'
while true
do
    # Keep the previous batch around for inspection; skip the move on the
    # very first iteration, when no output file exists yet.
    [ -f "$OUTPUT_FILE" ] && mv "$OUTPUT_FILE" "/tmp/$OUTPUT_FILE"
    curl -o "$OUTPUT_FILE" -X GET --header 'Accept: application/problem+json' --header "auth-token: $AUTH_TOKEN" "$URL"
    # Reuse $OUTPUT_FILE instead of repeating the literal filename, so the
    # two can never drift apart.
    python3 perform_graph_sync.py package "$OUTPUT_FILE" maven 2>&1 | tee -a pass-4-output.log
    sleep 2
done
@tuxdna
Copy link
Author

tuxdna commented Feb 14, 2018

Following error is encountered multiple times:


Traceback (most recent call last):
  File "perform_graph_sync.py", line 126, in <module>
    for future in concurrent.futures.as_completed(future_to_package_map):
  File "/usr/lib64/python3.4/concurrent/futures/_base.py", line 195, in as_completed
    with _AcquireFutures(fs):
  File "/usr/lib64/python3.4/concurrent/futures/_base.py", line 146, in __enter__
    future._condition.acquire()
AttributeError: '_io.TextIOWrapper' object has no attribute '_condition'

@tuxdna
Copy link
Author

tuxdna commented Feb 14, 2018

Remaining packages

In [9]: remainpackages = set([ x["name"] for x in data["data"]])

In [10]: len(remainpackages)
Out[10]: 66447

In [11]: len(remainpackages) / 131460.0
Out[11]: 0.5054541305340028

In [12]: 100 * len(remainpackages) / 131460.0
Out[12]: 50.545413053400274

Remaining package versions:


In [1]: import json

In [2]: data = json.load(open("maven-data-new.json"))

In [3]: only_maven = [x for x in data["data"] if x["ecosystem"] == "maven"]

In [4]: len(only_maven)
Out[4]: 435837

In [5]: 226752 / 435837 
Out[5]: 0.5202678983197847


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment