Skip to content

Instantly share code, notes, and snippets.

@timfel
Created August 25, 2022 18:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save timfel/3fc3b1f339917ff48b69289e79a39f8d to your computer and use it in GitHub Desktop.
Save timfel/3fc3b1f339917ff48b69289e79a39f8d to your computer and use it in GitHub Desktop.
from collections import namedtuple
import os
import pprint
import re
import requests
import shutil
import subprocess
import tarfile
import traceback
import zipfile
# Shared HTTP session; projects(), find_url() and download_sdist() all issue
# their GET requests through it.
session = requests.Session()
# Fallback archive URLs keyed by project name. find_url() returns the entry
# for a project only when the PyPI JSON lookup did not yield an sdist URL.
EXTRA_URLS = {
# source urls for some projects that have no sdist in PyPI
"torch": "https://github.com/pytorch/pytorch/archive/refs/tags/v1.10.2.zip",
"tensorflow": "https://github.com/tensorflow/tensorflow/archive/refs/tags/v2.8.0.zip",
"caffe": "https://github.com/BVLC/caffe/archive/refs/heads/master.zip",
"torchvision": "https://github.com/pytorch/vision/archive/refs/tags/v0.11.3.zip",
"keras": "https://github.com/keras-team/keras/archive/refs/tags/v2.8.0.zip",
"cv2": "https://github.com/opencv/opencv-python/archive/refs/heads/3.4.zip",
"nimbusml": "https://github.com/microsoft/NimbusML/archive/refs/heads/master.zip",
"mxnet": "https://github.com/apache/incubator-mxnet/archive/refs/tags/v2.0.0.beta0.rc1.zip",
"onnxruntime": "https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.10.0.zip",
"python-util": "https://github.com/MisterL2/python-util/archive/refs/heads/master.zip",
}
# Project names analysed when the script is run with --topDS.
# The literal below repeats names across its commented groups;
# dict.fromkeys() drops the repeats while keeping first-occurrence order, so
# the resulting list holds each name once. The position in that list becomes
# the index prefix of the files main() writes, so reordering entries changes
# the on-disk names.
DS_IMPORTANT_PACKAGES = list(
dict.fromkeys(
[
# top 10 in Data Science Through the Looking Glass
"numpy",
"matplotlib",
"pandas",
"scikit-learn",
"scipy",
"seaborn",
"tensorflow",
# "pylab", # pylab is just matplotlib.pylab
"requests",
"statsmodels",
# top 10 rank change in Data Science Through the Looking Glass
"torch",
"keras",
"xgboost",
"Pillow", # called PIL in the paper
"python-util",
"cv2", # really python-opencv
"tqdm",
"sqlalchemy",
"gensim",
"tensorflow",
# top 10 pct change in Data Science Through the Looking Glass
"pandas",
"matplotlib",
"scikit-learn",
"seaborn",
"keras",
"torch",
"numpy",
"tensorflow",
"Pillow", # called PIL in the paper
"cv2", # really python-opencv
# top 5 deep learning imports in Data Science Through the Looking Glass
"tensorflow",
"keras",
"theano",
"caffe",
"torch",
# top 10 imports in Data Science Through the Looking Glass
"scikit-learn",
"numpy",
"matplotlib",
"pandas",
"scipy",
"keras",
"seaborn",
"tensorflow",
"nltk",
"statsmodels",
# extras from correlation
"bs4",
"torchvision",
# "selenium", # pure Python
# release analysis packages from Data Science Through the Looking Glass
"keras",
"lasagne",
"matplotlib",
"nolearn",
"numpy",
"pandas",
"scikit-learn",
"scipy",
"seaborn",
"nimbusml",
"mxnet",
# extra packages
"category-encoders",
"dask",
"imbalanced-learn",
"lightgbm",
"onnx",
"onnxmltools",
"onnxruntime",
"orbit-ml",
"psutil",
"pyod",
"skl2onnx",
"sktime",
"plotly",
# needed
"cython",
]
)
)
# Matches one "<language>=<count>" token; main() applies it to the captured
# sloccount summary lines to add up per-language totals.
SLOCCOUNT_LANGUAGE_PATTERN = re.compile(r"([a-z]+)=(\d+)")
# Searched against the stamp file name in main(); a hit means the download is
# opened with tarfile (names ending in ".zip" are handled separately).
TARFILE_PATTERN = re.compile(r"\.(tar\.gz|tgz|bz2)$")
# File-name suffixes of the archive members that main() extracts and scans.
NATIVE_FILE_PATTERN = re.compile(r"\.(cc|cxx|cpp|C|CC|c\+\+|c|h|hpp|pyx|rs)$")
# Alternation of source-text markers, compiled with re.VERBOSE so that the
# whitespace and the "# ..." notes inside the pattern are not part of the
# match. main() records the matched text of every hit as one "native API"
# used by the package.
# NOTE(review): "tp_finalize?" and "tp_del?" make only the final letter
# optional, so they also match "tp_finaliz" / "tp_de" - confirm this is
# intended.
NATIVE_API_PATTERN = re.compile(
r"""
pyo3 # Rust package
|
\#include\s+[<"]cppy # cppy
|
\#include\s+[<"]pybind11 # pybind11
|
\#include\s+[<"]Python\.h[>"] # includes Python API
|
\#include\s+[<"]numpy # dependency on numpy API
|
Py_VISIT # macro used in tp_traverse
|
PyStructSequence_Desc # tuple subclass that uses a hack to hide some fields from Python code
|
PyCapsule_Destructor # destructor for capsules
|
PyObject_GC_U?n?Track # potentially (?) problematic GC access
|
PyUnicode_(?:DATA|WRITE|READ) # direct access to unicode void*
|
Py_TYPE\([^\)]+\)\s*=\s*(?!__pyx)[_\(\*_a-zA-Z0-9]+ # assignment to type
|
tp_traverse\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+ # own code that runs on GC
|
ob_type\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+ # assignment to type
|
tp_bases?\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+ # assignment to base and bases
|
tp_finalize?\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+
|
tp_del?\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+
|
PyTypeObject\s+(?!__pyx)[ \(\*_a-zA-Z0-9_]+\s+=\s+{ # static type definition
""",
re.VERBOSE,
)
# Any occurrence of "Py_"; main() treats a file containing it as API-using
# even when NATIVE_API_PATTERN found nothing in that file.
PYTHON_API_PATTERN = re.compile("Py_")
def projects():
    """Return the project names listed in the top-pypi-packages 30-day dump.

    Downloads the JSON document through the shared ``session`` and collects
    the ``"project"`` value of each entry under ``"rows"``, in listing order.

    Raises:
        Whatever ``raise_for_status()`` raises when the HTTP request did not
        succeed.
    """
    listing_url = (
        "https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json"
    )
    response = session.get(listing_url)
    response.raise_for_status()
    names = []
    for row in response.json()["rows"]:
        names.append(row["project"])
    return names
def find_url(proj):
    """Return a source-archive URL for *proj*, or ``None`` when none is known.

    The PyPI JSON document for the project is queried first, and the URL of
    the first file whose ``packagetype`` is ``"sdist"`` is returned. If the
    response cannot be parsed, lacks the expected keys, or lists no sdist,
    the lookup falls back to ``EXTRA_URLS``.

    Args:
        proj: Project name as used in the PyPI URL.

    Returns:
        The URL string, or ``None`` if neither PyPI nor ``EXTRA_URLS`` has one.
    """
    resp = session.get(f"https://pypi.org/pypi/{proj}/json")
    try:
        for release_file in resp.json()["urls"]:
            if release_file["packagetype"] == "sdist":
                return release_file["url"]
    except (ValueError, LookupError, TypeError):
        # Best effort: a non-JSON body (ValueError), a document without the
        # expected keys (LookupError) or with an unexpected shape (TypeError)
        # just means "no sdist found here". Unlike a bare ``except``, this no
        # longer swallows KeyboardInterrupt or SystemExit.
        pass
    return EXTRA_URLS.get(proj)
def download_sdist(idx, proj):
    """Download the source archive of *proj* into the current directory.

    The archive is saved under the generic name ``"<idx padded to 5>_<proj>"``
    and an empty "stamp" file named ``"<generic name>_<last URL segment>"`` is
    created next to it, so the original file name (and thereby the archive
    type) can be recovered later without asking PyPI again. To fetch a newer
    release, the previous download must be deleted manually.

    Args:
        idx: Index used as the file-name prefix (padded with zeros to width 5).
        proj: Project name.

    Returns:
        A ``(filename, stamp)`` tuple, or ``None`` when no download URL could
        be found for the project (callers that unpack the result will then
        fail with ``TypeError``).

    Raises:
        Whatever ``raise_for_status()`` raises when the download request did
        not succeed.
    """
    filename = f"{idx:>05}_{proj}"
    # Stamps are always created as "<filename>_<something>". Matching on that
    # exact prefix keeps an unrelated file such as "00001_foo-bar" from being
    # mistaken for the stamp of "00001_foo".
    stamp_prefix = f"{filename}_"
    if os.path.exists(filename):
        print(f"Exists: {filename}")
        # The context manager closes the directory iterator even though we
        # return from inside the loop.
        with os.scandir() as entries:
            for entry in entries:
                if entry.is_file() and entry.name.startswith(stamp_prefix):
                    return filename, entry.name
        # No stamp next to the archive: fall through and download again.
    url = find_url(proj)
    if not url:
        # Universal wheel only, maybe.
        print(f"Cannot find url for {proj}")
        return
    stamp = f'{filename}_{url[url.rfind("/") + 1 :]}'
    print(f"Saving {stamp} to {filename}")
    resp = session.get(url)
    resp.raise_for_status()
    with open(filename, "wb") as f:
        f.write(resp.content)
    with open(stamp, "wb") as f:
        f.write(b"")
    return filename, stamp
class PackageInfo(
    namedtuple("PackageInfo", "name interesting_files native_apis sloccount")
):
    """Analysis result for one package.

    Fields:
        name: Project name.
        interesting_files: Collection of paths of files in which a native API
            marker was found.
        native_apis: Mapping from matched marker text to the set of files it
            was found in.
        sloccount: Three-element list ``[api, py, total]``. Because the tuple
            itself is immutable, the ``api_sloccount``, ``py_sloccount`` and
            ``total_sloccount`` properties read and write the slots of this
            list in place.
    """

    @classmethod
    def new(cls, name, interesting_files, native_apis):
        """Create an instance with empty line-count slots (``[[], 0, []]``)."""
        return cls(name, interesting_files, native_apis, [[], 0, []])

    @property
    def api_sloccount(self):
        """Summary lines for the files that use the C API (slot 0)."""
        return self.sloccount[0]

    @api_sloccount.setter
    def api_sloccount(self, value):
        self.sloccount[0] = value

    @property
    def py_sloccount(self):
        """Number of lines containing 'Py' (slot 1)."""
        return self.sloccount[1]

    @py_sloccount.setter
    def py_sloccount(self, value):
        self.sloccount[1] = value

    @property
    def total_sloccount(self):
        """Summary lines for all extracted files of the package (slot 2)."""
        return self.sloccount[2]

    @total_sloccount.setter
    def total_sloccount(self, value):
        self.sloccount[2] = value

    def __repr__(self):
        """Return a multi-line, human-readable report for the package."""
        # Sorted so that the report is identical from run to run; iterating
        # the raw set would follow the per-process string hash order.
        extensions = ",".join(
            sorted({os.path.splitext(f)[1] for f in self.interesting_files})
        )
        apis = pprint.pformat(list(self.native_apis.keys()), width=120, indent=1)
        if "\n" in apis:
            # A wrapped list starts on its own line, every line indented.
            apis = ("\n" + apis).replace("\n", "\n ")
        separator = "\n "
        parts = [separator]
        for line in self.total_sloccount:
            parts += [line, separator]
        if self.api_sloccount:
            parts += ["In files using C API", separator]
            for line in self.api_sloccount:
                parts += [line, separator]
        if self.py_sloccount:
            parts += [
                f"Counting only lines with 'Py' in them: {self.py_sloccount}",
                separator,
            ]
        return f"{self.name}: {extensions} with {apis}\n{''.join(parts)}"
def _spin(chars):
    """Overwrite the previous spinner frame with the next one; rotate *chars* in place."""
    print("\033[1D", chars[0], sep="", end="", flush=True)
    chars.append(chars.pop(0))


def _sloc_by_language(directory):
    """Run ``sloccount`` on *directory*; return its "SLOC-by-Language" line and the line after it.

    Returns an empty list when the header line is not in the output or the
    ``sloccount`` executable cannot be started.
    """
    try:
        # Argument list instead of a shell string: the directory name is
        # derived from a package name and must not be interpreted by a shell.
        proc = subprocess.run(
            ["sloccount", directory],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            encoding="utf-8",
            errors="replace",
        )
    except OSError:
        return []
    output = proc.stdout
    if output.endswith("\n"):
        # Drop the final newline so a header on the last line is not followed
        # by a spurious empty entry.
        output = output[:-1]
    summary = []
    for line in output.split("\n"):
        if summary:
            summary.append(line)
            break
        if "SLOC-by-Language" in line:
            summary.append(line)
    return summary


def _add_sloc_counts(totals, summary_lines):
    """Add every ``language=count`` token found in *summary_lines* to *totals*."""
    for line in summary_lines:
        for m in SLOCCOUNT_LANGUAGE_PATTERN.finditer(line):
            totals[m.group(1)] = totals.get(m.group(1), 0) + int(m.group(2))


def _analyse_archive(proj, filename, archive):
    """Extract and scan the native source files of one opened archive.

    Returns a ``PackageInfo`` when at least one file matched
    ``NATIVE_API_PATTERN``, otherwise ``None``.
    """
    if isinstance(archive, zipfile.ZipFile):
        # Zip members are addressed by name; tar members by TarInfo object.
        members = archive.namelist()
        names = members
    else:
        members = archive.getmembers()
        names = [m.name for m in members]
    interesting_files = [
        member
        for member_name, member in zip(names, members)
        # Skip absolute paths and anything containing ".." so extraction
        # stays inside the target directory.
        if not member_name.startswith("/")
        and ".." not in member_name
        and NATIVE_FILE_PATTERN.search(member_name)
    ]
    if not interesting_files:
        return None

    info = PackageInfo.new(name=proj, interesting_files=set(), native_apis={})
    dirname = f"{filename}.dir"
    apidirname = f"{filename}.dir.with_api_usage"
    if not os.path.exists(dirname):
        os.makedirs(dirname, exist_ok=True)
        archive.extractall(path=dirname, members=interesting_files)

    chars = list(r"/-\|")
    for member in interesting_files:
        _spin(chars)
        path = os.path.join(dirname, getattr(member, "name", member))
        with open(path, "rb") as f:
            content = f.read().decode("utf-8", errors="replace")
        uses_c_api = False
        for m in NATIVE_API_PATTERN.finditer(content):
            print("\033[1DX\033[1C", end="", flush=True)
            info.interesting_files.add(path)
            info.native_apis.setdefault(m.group(0), set()).add(path)
            uses_c_api = True
        if uses_c_api or PYTHON_API_PATTERN.search(content):
            # Mirror the file into a second tree so sloccount can be run on
            # the API-using files alone.
            sloccountname = os.path.join(apidirname, getattr(member, "name", member))
            sloccountnamedir = os.path.dirname(sloccountname)
            if not os.path.exists(sloccountnamedir):
                os.makedirs(sloccountnamedir, exist_ok=True)
            # modify some file endings for sloccount to consider them
            if sloccountname.endswith(".pyx"):
                sloccountname += ".py"
            elif sloccountname.endswith(".rs"):
                sloccountname += ".cpp"
            shutil.copy(path, sloccountname)

    info.total_sloccount = _sloc_by_language(dirname)
    api_summary = []
    if os.path.exists(apidirname):
        api_summary = _sloc_by_language(apidirname)
        py_lines = 0
        for dirpath, _dirnames, filenames in os.walk(apidirname):
            _spin(chars)
            for fname in filenames:
                # NOTE(review): ".pyx" files were copied as ".pyx.py" above,
                # so this test never matches them - confirm whether Cython
                # sources were meant to be skipped here.
                if os.path.splitext(fname)[1] == ".pyx":
                    continue
                # Decode leniently: source files are not guaranteed to be
                # valid in the platform default encoding.
                with open(
                    os.path.join(dirpath, fname), "r", encoding="utf-8", errors="replace"
                ) as source:
                    py_lines += sum(1 for line in source if "Py" in line)
        info.py_sloccount = info.py_sloccount + py_lines
        shutil.rmtree(apidirname)
    info.api_sloccount = api_summary
    print()
    if info.interesting_files:
        return info
    # Nothing of interest: do not keep the extracted sources around.
    shutil.rmtree(dirname)
    return None


def main(projs, prefix=""):
    """Download, extract and analyse every project in *projs*, then print a report.

    For each project the archive is fetched with ``download_sdist``, its
    native source files are extracted into ``"<download name>.dir"`` and
    scanned for C API markers, and ``sloccount`` is run on the result.
    Projects whose download fails or whose archive type is not recognised are
    reported and skipped. A per-package report and the summed totals are
    printed at the end.

    Args:
        projs: Iterable of project names.
        prefix: String placed in front of each project's index when building
            the download file name.
    """
    infos = []
    processed = 0
    for idx, p in enumerate(projs):
        processed = idx + 1
        try:
            filename, stamp = download_sdist(f"{prefix}{idx}", p)
        except Exception:
            traceback.print_exc()
            print(f"Failed to download {p}")
            continue
        if stamp.endswith(".zip"):
            archive = zipfile.ZipFile(filename)
        elif TARFILE_PATTERN.search(stamp):
            archive = tarfile.open(filename)
        else:
            print(f"Don't know how to extract {stamp}")
            continue
        # Close the archive once the package is done instead of keeping one
        # open file handle per analysed package.
        with archive:
            info = _analyse_archive(p, filename, archive)
        if info is not None:
            infos.append(info)

    total_sloc = {}
    api_sloc = {}
    py_sloc = 0
    # "processed" is the number of projects looked at; the loop index alone
    # is one too small and undefined for an empty input.
    print(f"\n{len(infos)} of top {processed} packages found to have C API usage.\n")
    for info in infos:
        print(info, "\n")
        _add_sloc_counts(total_sloc, info.total_sloccount)
        _add_sloc_counts(api_sloc, info.api_sloccount)
        py_sloc += info.py_sloccount
    print()
    print("Totals")
    print("Package SLOC:", total_sloc)
    print("API using files SLOC:", api_sloc)
    print("Lines with 'Py':", py_sloc)
if __name__ == "__main__":
    from argparse import ArgumentParser
    import sys

    # Command-line entry point: exactly one of the two flags must be given.
    cli = ArgumentParser(
        description="Analyse the top packages either of PyPI or for data science. Must choose either or!"
    )
    for flag in ("--top5000", "--topDS"):
        cli.add_argument(flag, action="store_true")
    options = cli.parse_args(sys.argv[1:])

    # Both flags are booleans, so "both set" and "neither set" are the two
    # cases in which they compare equal.
    if options.top5000 == options.topDS:
        cli.print_help()
        sys.exit(1)

    if options.topDS:
        main(DS_IMPORTANT_PACKAGES, prefix="DS_")
    else:
        main(projects())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment