# timfel/package_c_api_overview.py

## package_c_api_overview.py
from collections import namedtuple
import os
import pprint
import re
import requests
import shutil
import subprocess
import tarfile
import traceback
import zipfile

# One HTTP session shared by projects(), find_url() and download_sdist().
session = requests.Session()


# Source archive URLs for projects that have no sdist on PyPI, keyed by the
# project name used in this script. find_url() falls back to this table.
EXTRA_URLS = dict(
    (
        ("torch", "https://github.com/pytorch/pytorch/archive/refs/tags/v1.10.2.zip"),
        ("tensorflow", "https://github.com/tensorflow/tensorflow/archive/refs/tags/v2.8.0.zip"),
        ("caffe", "https://github.com/BVLC/caffe/archive/refs/heads/master.zip"),
        ("torchvision", "https://github.com/pytorch/vision/archive/refs/tags/v0.11.3.zip"),
        ("keras", "https://github.com/keras-team/keras/archive/refs/tags/v2.8.0.zip"),
        ("cv2", "https://github.com/opencv/opencv-python/archive/refs/heads/3.4.zip"),
        ("nimbusml", "https://github.com/microsoft/NimbusML/archive/refs/heads/master.zip"),
        ("mxnet", "https://github.com/apache/incubator-mxnet/archive/refs/tags/v2.0.0.beta0.rc1.zip"),
        ("onnxruntime", "https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.10.0.zip"),
        ("python-util", "https://github.com/MisterL2/python-util/archive/refs/heads/master.zip"),
    )
)


# Packages analysed with --topDS. Each group below corresponds to one source
# list; a package may be named in several groups, and the final list keeps
# only the first occurrence of every name, in order.
_DS_PACKAGE_GROUPS = (
    # top 10 in Data Science Through the Looking Glass
    (
        "numpy",
        "matplotlib",
        "pandas",
        "scikit-learn",
        "scipy",
        "seaborn",
        "tensorflow",
        # "pylab", # pylab is just matplotlib.pylab
        "requests",
        "statsmodels",
    ),
    # top 10 rank change in Data Science Through the Looking Glass
    (
        "torch",
        "keras",
        "xgboost",
        "Pillow",  # called PIL in the paper
        "python-util",
        "cv2",  # really python-opencv
        "tqdm",
        "sqlalchemy",
        "gensim",
        "tensorflow",
    ),
    # top 10 pct change in Data Science Through the Looking Glass
    (
        "pandas",
        "matplotlib",
        "scikit-learn",
        "seaborn",
        "keras",
        "torch",
        "numpy",
        "tensorflow",
        "Pillow",  # called PIL in the paper
        "cv2",  # really python-opencv
    ),
    # top 5 deep learning imports in Data Science Through the Looking Glass
    (
        "tensorflow",
        "keras",
        "theano",
        "caffe",
        "torch",
    ),
    # top 10 imports in Data Science Through the Looking Glass
    (
        "scikit-learn",
        "numpy",
        "matplotlib",
        "pandas",
        "scipy",
        "keras",
        "seaborn",
        "tensorflow",
        "nltk",
        "statsmodels",
    ),
    # extras from correlation
    (
        "bs4",
        "torchvision",
        # "selenium", # pure Python
    ),
    # release analysis packages from Data Science Through the Looking Glass
    (
        "keras",
        "lasagne",
        "matplotlib",
        "nolearn",
        "numpy",
        "pandas",
        "scikit-learn",
        "scipy",
        "seaborn",
        "nimbusml",
        "mxnet",
    ),
    # extra packages
    (
        "category-encoders",
        "dask",
        "imbalanced-learn",
        "lightgbm",
        "onnx",
        "onnxmltools",
        "onnxruntime",
        "orbit-ml",
        "psutil",
        "pyod",
        "skl2onnx",
        "sktime",
        "plotly",
    ),
    # needed
    ("cython",),
)

DS_IMPORTANT_PACKAGES = list(
    dict.fromkeys(pkg for group in _DS_PACKAGE_GROUPS for pkg in group)
)


# Matches one "<language>=<count>" pair (lowercase letters, then digits);
# main() applies it to the lines captured from the sloccount output.
SLOCCOUNT_LANGUAGE_PATTERN = re.compile(r"([a-z]+)=(\d+)")
# Matches a name ending in .tar.gz, .tgz or .bz2; main() uses it on the stamp
# file name to decide whether to open the download with tarfile.
TARFILE_PATTERN = re.compile(r"\.(tar\.gz|tgz|bz2)$")
# Matches archive member names by extension: C/C++ sources and headers,
# Cython (.pyx) and Rust (.rs) files.
NATIVE_FILE_PATTERN = re.compile(r"\.(cc|cxx|cpp|C|CC|c\+\+|c|h|hpp|pyx|rs)$")
# Alternation of the source-text markers this script reports as native API
# usage. Compiled with re.VERBOSE, so the whitespace and the "# ..." remarks
# inside the pattern are not part of the match; "\#" is a literal '#'.
# Several alternatives carry a (?!__pyx) lookahead so that a right-hand side
# starting with "__pyx" does not match.
NATIVE_API_PATTERN = re.compile(
    r"""
    pyo3                                       # Rust package
    |
    \#include\s+[<"]cppy                     # cppy
    |
    \#include\s+[<"]pybind11                     # pybind11
    |
    \#include\s+[<"]Python\.h[>"]                # includes Python API
    |
    \#include\s+[<"]numpy                        # dependency on numpy API
    |
    Py_VISIT                                   # macro used in tp_traverse
    |
    PyStructSequence_Desc                      # tuple subclass that uses a hack to hide some fields from Python code
    |
    PyCapsule_Destructor                       # destructor for capsules
    |
    PyObject_GC_U?n?Track                      # potentially (?) problematic GC access
    |
    PyUnicode_(?:DATA|WRITE|READ)              # direct access to unicode void*
    |
    Py_TYPE\([^\)]+\)\s*=\s*(?!__pyx)[_\(\*_a-zA-Z0-9]+       # assignment to type
    |
    tp_traverse\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+   # own code that runs on GC
    |
    ob_type\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+       # assignment to type
    |
    tp_bases?\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+     # assignment to base and bases
    |
    tp_finalize?\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+
    |
    tp_del?\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+
    |
    PyTypeObject\s+(?!__pyx)[ \(\*_a-zA-Z0-9_]+\s+=\s+{       # static type definition
    """,
    re.VERBOSE,
)
# Looser check: any occurrence of "Py_". main() treats a file matching this
# as using the C API even when NATIVE_API_PATTERN found nothing in it.
PYTHON_API_PATTERN = re.compile("Py_")


def projects():
    """Return the project names from the top-pypi-packages 30-day listing.

    The names are returned in the order of the listing's ``rows``.

    Raises:
        requests.HTTPError: if the listing request returns an error status.
    """
    listing_url = (
        "https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json"
    )
    response = session.get(listing_url)
    response.raise_for_status()
    names = []
    for row in response.json()["rows"]:
        names.append(row["project"])
    return names


def find_url(proj):
    """Return a source download URL for *proj*, or ``None`` if none is known.

    The PyPI JSON API is queried and the URL of the first release file whose
    ``packagetype`` is ``sdist`` is returned. When PyPI offers no usable
    answer, the ``EXTRA_URLS`` table is consulted instead.

    Args:
        proj: project name as used on PyPI (and as key of ``EXTRA_URLS``).
    """
    resp = session.get(f"https://pypi.org/pypi/{proj}/json")
    try:
        for release_file in resp.json()["urls"]:
            if release_file["packagetype"] == "sdist":
                return release_file["url"]
    except (ValueError, KeyError, TypeError):
        # Best effort: a body that is not JSON (ValueError) or a document
        # without the expected keys/shape means "no sdist known to PyPI".
        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit
        # propagate so a long run can still be aborted here.
        pass
    return EXTRA_URLS.get(proj)


def download_sdist(idx, proj):
    """Download the source archive of *proj* into the current directory.

    The archive is stored under the generic name ``{idx:>05}_{proj}`` and an
    empty stamp file ``{generic name}_{last URL path segment}`` is created
    next to it. The stamp keeps the original archive name (and thus its
    extension) so that later runs need no request to PyPI, not even for the
    URL. To fetch a newer release, delete the previous download manually.

    Args:
        idx: index (or prefixed index string) used to build the file name.
        proj: project name.

    Returns:
        ``(filename, stamp)``, or ``None`` when no download URL is known.

    Raises:
        requests.HTTPError: if the download returns an error status.
    """
    filename = f"{idx:>05}_{proj}"
    if os.path.exists(filename):
        print(f"Exists: {filename}")
        # Stamps are always "<filename>_<archive name>". Matching on the
        # trailing underscore keeps e.g. "00001_torch" from picking up the
        # stamp of "00001_torchvision".
        stamp_prefix = f"{filename}_"
        with os.scandir() as entries:
            for entry in entries:
                if entry.is_file() and entry.name.startswith(stamp_prefix):
                    return filename, entry.name
        # No stamp found: fall through and download again.
    url = find_url(proj)
    if not url:
        # Universal wheel only, maybe.
        print(f"Cannot find url for {proj}")
        return None
    stamp = f'{filename}_{url[url.rfind("/") + 1 :]}'
    print(f"Saving {stamp} to {filename}")
    resp = session.get(url)
    resp.raise_for_status()
    with open(filename, "wb") as f:
        f.write(resp.content)
    with open(stamp, "wb") as f:
        f.write(b"")
    return filename, stamp


class PackageInfo(
    namedtuple("PackageInfo", "name interesting_files native_apis sloccount")
):
    """Analysis result for one package.

    Fields:
        name: project name.
        interesting_files: set of extracted file paths with a native API match.
        native_apis: dict mapping each matched text to the set of files
            it was found in.
        sloccount: mutable three-slot list ``[api, py, total]``; use the
            ``api_sloccount``, ``py_sloccount`` and ``total_sloccount``
            properties instead of indexing it directly.
    """

    @classmethod
    def new(cls, name, interesting_files, native_apis):
        """Create an instance with empty line-count slots."""
        return cls(name, interesting_files, native_apis, [[], 0, []])

    @property
    def api_sloccount(self):
        """sloccount output lines for the files that use the C API."""
        return self.sloccount[0]

    @api_sloccount.setter
    def api_sloccount(self, value):
        self.sloccount[0] = value

    @property
    def py_sloccount(self):
        """Number of lines containing 'Py' in the files that use the C API."""
        return self.sloccount[1]

    @py_sloccount.setter
    def py_sloccount(self, value):
        self.sloccount[1] = value

    @property
    def total_sloccount(self):
        """sloccount output lines for all extracted files of the package."""
        return self.sloccount[2]

    @total_sloccount.setter
    def total_sloccount(self, value):
        self.sloccount[2] = value

    def __repr__(self):
        """Return the multi-line, human-readable report for this package."""
        indent = "\n        "
        # Sorted so that the report is identical from run to run; joining the
        # bare set would follow its arbitrary iteration order.
        extensions = ",".join(
            sorted({os.path.splitext(f)[1] for f in self.interesting_files})
        )
        apis = pprint.pformat(list(self.native_apis.keys()), width=120, indent=1)
        if "\n" in apis:
            apis = ("\n" + apis).replace("\n", indent)
        lines = list(self.total_sloccount)
        if self.api_sloccount:
            lines.append("In files using C API")
            lines.extend(self.api_sloccount)
        if self.py_sloccount:
            lines.append(f"Counting only lines with 'Py' in them: {self.py_sloccount}")
        # Every report line is followed by the indent, after one leading indent.
        report = indent + "".join(line + indent for line in lines)
        return f"{self.name}: {extensions} with {apis}\n{report}"


def _spinner():
    """Print the next frame of a one-character progress spinner per ``next()``."""
    while True:
        for frame in r"/-\|":
            print("\033[1D", frame, sep="", end="", flush=True)
            yield


def _open_archive(filename, stamp):
    """Open the download *filename* as zip or tar, judging by the *stamp* name.

    Returns ``None`` when the stamp has no recognised archive extension.
    """
    if stamp.endswith(".zip"):
        return zipfile.ZipFile(filename)
    if TARFILE_PATTERN.search(stamp):
        return tarfile.open(filename)
    return None


def _native_members(archive):
    """Return the members of *archive* that look like native source files.

    Zip archives contribute member names, tar archives ``TarInfo`` objects;
    either form can be handed to the archive's ``extractall``. Absolute names
    and names containing ``..`` are left out so nothing is extracted outside
    the target directory.
    """
    if isinstance(archive, zipfile.ZipFile):
        pairs = [(name, name) for name in archive.namelist()]
    else:
        pairs = [(member.name, member) for member in archive.getmembers()]
    return [
        member
        for name, member in pairs
        if not name.startswith("/")
        and ".." not in name
        and NATIVE_FILE_PATTERN.search(name)
    ]


def _sloc_by_language(path):
    """Run ``sloccount`` on *path*; return its "SLOC-by-Language" line and the next one.

    Returns an empty list when the header line is absent from the output or
    the ``sloccount`` program cannot be started.
    """
    try:
        # Argument list instead of a shell string: the directory name is
        # derived from a package name and must not be interpreted by a shell.
        proc = subprocess.run(
            ["sloccount", path],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            errors="replace",
            check=False,
        )
    except OSError as err:
        print(f"Cannot run sloccount on {path}: {err}")
        return []
    lines = proc.stdout.split("\n")
    for pos, line in enumerate(lines):
        if "SLOC-by-Language" in line:
            return lines[pos : pos + 2]
    return []


def _count_py_lines(apidirname, spin):
    """Count the lines containing ``Py`` in the files below *apidirname*."""
    count = 0
    for dirpath, _dirnames, filenames in os.walk(apidirname):
        next(spin)
        for fname in filenames:
            # Cython sources are excluded from this count. They were copied
            # as "*.pyx.py" for sloccount, so a plain ".pyx" extension test
            # would never match them.
            if fname.endswith((".pyx", ".pyx.py")):
                continue
            # Decode leniently: source files are not guaranteed to be UTF-8.
            with open(
                os.path.join(dirpath, fname), encoding="utf-8", errors="replace"
            ) as src:
                count += sum(1 for line in src if "Py" in line)
    return count


def _analyse_package(name, filename, stamp):
    """Extract and scan one downloaded package.

    Returns a ``PackageInfo`` when at least one file matches
    ``NATIVE_API_PATTERN``, otherwise ``None``.
    """
    archive = _open_archive(filename, stamp)
    if archive is None:
        print(f"Don't know how to extract {stamp}")
        return None
    dirname = f"{filename}.dir"
    apidirname = f"{filename}.dir.with_api_usage"
    # The archive is only needed for listing and extraction; close it
    # afterwards so a long run does not accumulate open file handles.
    with archive:
        members = _native_members(archive)
        if members and not os.path.exists(dirname):
            os.makedirs(dirname, exist_ok=True)
            archive.extractall(path=dirname, members=members)
    if not members:
        return None

    info = PackageInfo.new(name=name, interesting_files=set(), native_apis={})
    spin = _spinner()
    for member in members:
        next(spin)
        relname = getattr(member, "name", member)
        path = os.path.join(dirname, relname)
        if not os.path.isfile(path):
            # Directory or dangling link whose name merely looks like a source file.
            continue
        with open(path, "rb") as f:
            content = f.read().decode("utf-8", errors="replace")
        uses_c_api = False
        for m in NATIVE_API_PATTERN.finditer(content):
            print("\033[1DX\033[1C", end="", flush=True)
            info.interesting_files.add(path)
            info.native_apis.setdefault(m.group(0), set()).add(path)
            uses_c_api = True
        if uses_c_api or PYTHON_API_PATTERN.search(content):
            target = os.path.join(apidirname, relname)
            os.makedirs(os.path.dirname(target), exist_ok=True)
            # modify some file endings for sloccount to consider them
            if target.endswith(".pyx"):
                target += ".py"
            elif target.endswith(".rs"):
                target += ".cpp"
            shutil.copy(path, target)

    info.total_sloccount = _sloc_by_language(dirname)
    api_sloc = []
    if os.path.exists(apidirname):
        api_sloc = _sloc_by_language(apidirname)
        info.py_sloccount = info.py_sloccount + _count_py_lines(apidirname, spin)
        shutil.rmtree(apidirname)
    info.api_sloccount = api_sloc
    print()
    if info.interesting_files:
        return info
    # Nothing of interest: do not keep the extracted sources around.
    shutil.rmtree(dirname)
    return None


def _print_summary(infos, examined):
    """Print the per-package reports followed by the summed line counts."""
    total_sloc = {}
    api_sloc = {}
    py_sloc = 0
    print(f"\n{len(infos)} of top {examined} packages found to have C API usage.\n")
    for info in infos:
        print(info, "\n")
        for line in info.total_sloccount:
            for m in SLOCCOUNT_LANGUAGE_PATTERN.finditer(line):
                total_sloc[m.group(1)] = total_sloc.get(m.group(1), 0) + int(m.group(2))
        for line in info.api_sloccount:
            for m in SLOCCOUNT_LANGUAGE_PATTERN.finditer(line):
                api_sloc[m.group(1)] = api_sloc.get(m.group(1), 0) + int(m.group(2))
        py_sloc += info.py_sloccount
    print()
    print("Totals")
    print("Package SLOC:", total_sloc)
    print("API using files SLOC:", api_sloc)
    print("Lines with 'Py':", py_sloc)


def main(projs, prefix=""):
    """Download, extract and scan every project in *projs*, then print a report.

    For each project the source archive is fetched with ``download_sdist``,
    its native source files are extracted into ``<download>.dir`` and searched
    for ``NATIVE_API_PATTERN``. Line counts come from the external
    ``sloccount`` program.

    Args:
        projs: iterable of project names; may be empty.
        prefix: text put in front of the running index in download file names.
    """
    infos = []
    examined = 0  # tracked separately so an empty *projs* still reports cleanly
    for idx, p in enumerate(projs):
        examined = idx + 1
        try:
            downloaded = download_sdist(f"{prefix}{idx}", p)
        except Exception:
            # Boundary of the batch job: report and carry on with the next one.
            traceback.print_exc()
            print(f"Failed to download {p}")
            continue
        if downloaded is None:
            # No source URL known; download_sdist has already said so.
            continue
        filename, stamp = downloaded
        info = _analyse_package(p, filename, stamp)
        if info is not None:
            infos.append(info)
    _print_summary(infos, examined)


if __name__ == "__main__":
    import sys
    from argparse import ArgumentParser

    parser = ArgumentParser(
        description="Analyse the top packages either of PyPI or for data science. Must choose either or!"
    )
    for flag in ("--top5000", "--topDS"):
        parser.add_argument(flag, action="store_true")
    args = parser.parse_args(sys.argv[1:])
    # Exactly one of the two switches has to be given.
    if args.top5000 == args.topDS:
        parser.print_help()
        sys.exit(1)
    if args.topDS:
        main(DS_IMPORTANT_PACKAGES, prefix="DS_")
    else:
        main(projects())
	from collections import namedtuple
	import os
	import pprint
	import re
	import requests
	import shutil
	import subprocess
	import tarfile
	import traceback
	import zipfile

	# One HTTP session shared by projects(), find_url() and download_sdist().
	session = requests.Session()


	# Source archive URLs for projects that have no sdist on PyPI, keyed by the
	# project name used in this script. find_url() falls back to this table.
	EXTRA_URLS = dict(
	    (
	        ("torch", "https://github.com/pytorch/pytorch/archive/refs/tags/v1.10.2.zip"),
	        ("tensorflow", "https://github.com/tensorflow/tensorflow/archive/refs/tags/v2.8.0.zip"),
	        ("caffe", "https://github.com/BVLC/caffe/archive/refs/heads/master.zip"),
	        ("torchvision", "https://github.com/pytorch/vision/archive/refs/tags/v0.11.3.zip"),
	        ("keras", "https://github.com/keras-team/keras/archive/refs/tags/v2.8.0.zip"),
	        ("cv2", "https://github.com/opencv/opencv-python/archive/refs/heads/3.4.zip"),
	        ("nimbusml", "https://github.com/microsoft/NimbusML/archive/refs/heads/master.zip"),
	        ("mxnet", "https://github.com/apache/incubator-mxnet/archive/refs/tags/v2.0.0.beta0.rc1.zip"),
	        ("onnxruntime", "https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.10.0.zip"),
	        ("python-util", "https://github.com/MisterL2/python-util/archive/refs/heads/master.zip"),
	    )
	)


	# Packages analysed with --topDS. Each group below corresponds to one source
	# list; a package may be named in several groups, and the final list keeps
	# only the first occurrence of every name, in order.
	_DS_PACKAGE_GROUPS = (
	    # top 10 in Data Science Through the Looking Glass
	    (
	        "numpy",
	        "matplotlib",
	        "pandas",
	        "scikit-learn",
	        "scipy",
	        "seaborn",
	        "tensorflow",
	        # "pylab", # pylab is just matplotlib.pylab
	        "requests",
	        "statsmodels",
	    ),
	    # top 10 rank change in Data Science Through the Looking Glass
	    (
	        "torch",
	        "keras",
	        "xgboost",
	        "Pillow",  # called PIL in the paper
	        "python-util",
	        "cv2",  # really python-opencv
	        "tqdm",
	        "sqlalchemy",
	        "gensim",
	        "tensorflow",
	    ),
	    # top 10 pct change in Data Science Through the Looking Glass
	    (
	        "pandas",
	        "matplotlib",
	        "scikit-learn",
	        "seaborn",
	        "keras",
	        "torch",
	        "numpy",
	        "tensorflow",
	        "Pillow",  # called PIL in the paper
	        "cv2",  # really python-opencv
	    ),
	    # top 5 deep learning imports in Data Science Through the Looking Glass
	    (
	        "tensorflow",
	        "keras",
	        "theano",
	        "caffe",
	        "torch",
	    ),
	    # top 10 imports in Data Science Through the Looking Glass
	    (
	        "scikit-learn",
	        "numpy",
	        "matplotlib",
	        "pandas",
	        "scipy",
	        "keras",
	        "seaborn",
	        "tensorflow",
	        "nltk",
	        "statsmodels",
	    ),
	    # extras from correlation
	    (
	        "bs4",
	        "torchvision",
	        # "selenium", # pure Python
	    ),
	    # release analysis packages from Data Science Through the Looking Glass
	    (
	        "keras",
	        "lasagne",
	        "matplotlib",
	        "nolearn",
	        "numpy",
	        "pandas",
	        "scikit-learn",
	        "scipy",
	        "seaborn",
	        "nimbusml",
	        "mxnet",
	    ),
	    # extra packages
	    (
	        "category-encoders",
	        "dask",
	        "imbalanced-learn",
	        "lightgbm",
	        "onnx",
	        "onnxmltools",
	        "onnxruntime",
	        "orbit-ml",
	        "psutil",
	        "pyod",
	        "skl2onnx",
	        "sktime",
	        "plotly",
	    ),
	    # needed
	    ("cython",),
	)

	DS_IMPORTANT_PACKAGES = list(
	    dict.fromkeys(pkg for group in _DS_PACKAGE_GROUPS for pkg in group)
	)


	# NOTE: this copy of the patterns had been damaged by markdown escaping
	# ("|" turned into "\|", which matches a literal pipe, and "\s*" reduced
	# to "\s"); the regular expressions below are the undamaged forms.

	# Matches one "<language>=<count>" pair (lowercase letters, then digits).
	SLOCCOUNT_LANGUAGE_PATTERN = re.compile(r"([a-z]+)=(\d+)")
	# Matches a name ending in .tar.gz, .tgz or .bz2.
	TARFILE_PATTERN = re.compile(r"\.(tar\.gz|tgz|bz2)$")
	# Matches file names by extension: C/C++ sources and headers, Cython (.pyx)
	# and Rust (.rs) files.
	NATIVE_FILE_PATTERN = re.compile(r"\.(cc|cxx|cpp|C|CC|c\+\+|c|h|hpp|pyx|rs)$")
	# Alternation of the source-text markers reported as native API usage.
	# Compiled with re.VERBOSE, so whitespace and "# ..." remarks inside the
	# pattern are not part of the match; "\#" is a literal '#'.
	NATIVE_API_PATTERN = re.compile(
	    r"""
	    pyo3                                       # Rust package
	    |
	    \#include\s+[<"]cppy                     # cppy
	    |
	    \#include\s+[<"]pybind11                     # pybind11
	    |
	    \#include\s+[<"]Python\.h[>"]                # includes Python API
	    |
	    \#include\s+[<"]numpy                        # dependency on numpy API
	    |
	    Py_VISIT                                   # macro used in tp_traverse
	    |
	    PyStructSequence_Desc                      # tuple subclass that uses a hack to hide some fields from Python code
	    |
	    PyCapsule_Destructor                       # destructor for capsules
	    |
	    PyObject_GC_U?n?Track                      # potentially (?) problematic GC access
	    |
	    PyUnicode_(?:DATA|WRITE|READ)              # direct access to unicode void*
	    |
	    Py_TYPE\([^\)]+\)\s*=\s*(?!__pyx)[_\(\*_a-zA-Z0-9]+       # assignment to type
	    |
	    tp_traverse\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+   # own code that runs on GC
	    |
	    ob_type\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+       # assignment to type
	    |
	    tp_bases?\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+     # assignment to base and bases
	    |
	    tp_finalize?\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+
	    |
	    tp_del?\s*=\s*(?!__pyx)[ \(\*_a-zA-Z0-9]+
	    |
	    PyTypeObject\s+(?!__pyx)[ \(\*_a-zA-Z0-9_]+\s+=\s+{       # static type definition
	    """,
	    re.VERBOSE,
	)
	# Looser check: any occurrence of "Py_".
	PYTHON_API_PATTERN = re.compile("Py_")


	def projects():
	    """Return the project names from the top-pypi-packages 30-day listing.

	    The names are returned in the order of the listing's ``rows``.

	    Raises:
	        requests.HTTPError: if the listing request returns an error status.
	    """
	    resp = session.get(
	        "https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.min.json"
	    )
	    resp.raise_for_status()
	    return [p["project"] for p in resp.json()["rows"]]


	def find_url(proj):
	    """Return a source download URL for *proj*, or ``None`` if none is known.

	    The PyPI JSON API is queried and the URL of the first release file whose
	    ``packagetype`` is ``sdist`` is returned. When PyPI offers no usable
	    answer, the ``EXTRA_URLS`` table is consulted instead.

	    Args:
	        proj: project name as used on PyPI (and as key of ``EXTRA_URLS``).
	    """
	    resp = session.get(f"https://pypi.org/pypi/{proj}/json")
	    try:
	        for release_file in resp.json()["urls"]:
	            if release_file["packagetype"] == "sdist":
	                return release_file["url"]
	    except (ValueError, KeyError, TypeError):
	        # Best effort: a body that is not JSON (ValueError) or a document
	        # without the expected keys/shape means "no sdist known to PyPI".
	        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit
	        # propagate so a long run can still be aborted here.
	        pass
	    return EXTRA_URLS.get(proj)


	def download_sdist(idx, proj):
	    """Download the source archive of *proj* into the current directory.

	    The archive is stored under the generic name ``{idx:>05}_{proj}`` and an
	    empty stamp file ``{generic name}_{last URL path segment}`` is created
	    next to it. The stamp keeps the original archive name (and thus its
	    extension) so that later runs need no request to PyPI, not even for the
	    URL. To fetch a newer release, delete the previous download manually.

	    Args:
	        idx: index (or prefixed index string) used to build the file name.
	        proj: project name.

	    Returns:
	        ``(filename, stamp)``, or ``None`` when no download URL is known.

	    Raises:
	        requests.HTTPError: if the download returns an error status.
	    """
	    filename = f"{idx:>05}_{proj}"
	    if os.path.exists(filename):
	        print(f"Exists: {filename}")
	        # Stamps are always "<filename>_<archive name>". Matching on the
	        # trailing underscore keeps e.g. "00001_torch" from picking up the
	        # stamp of "00001_torchvision".
	        stamp_prefix = f"{filename}_"
	        with os.scandir() as entries:
	            for entry in entries:
	                if entry.is_file() and entry.name.startswith(stamp_prefix):
	                    return filename, entry.name
	        # No stamp found: fall through and download again.
	    url = find_url(proj)
	    if not url:
	        # Universal wheel only, maybe.
	        print(f"Cannot find url for {proj}")
	        return None
	    stamp = f'{filename}_{url[url.rfind("/") + 1 :]}'
	    print(f"Saving {stamp} to {filename}")
	    resp = session.get(url)
	    resp.raise_for_status()
	    with open(filename, "wb") as f:
	        f.write(resp.content)
	    with open(stamp, "wb") as f:
	        f.write(b"")
	    return filename, stamp


	class PackageInfo(
	    namedtuple("PackageInfo", "name interesting_files native_apis sloccount")
	):
	    """Analysis result for one package.

	    Fields:
	        name: project name.
	        interesting_files: set of extracted file paths with a native API match.
	        native_apis: dict mapping each matched text to the set of files
	            it was found in.
	        sloccount: mutable three-slot list ``[api, py, total]``; use the
	            ``api_sloccount``, ``py_sloccount`` and ``total_sloccount``
	            properties instead of indexing it directly.
	    """

	    @classmethod
	    def new(cls, name, interesting_files, native_apis):
	        """Create an instance with empty line-count slots."""
	        return cls(name, interesting_files, native_apis, [[], 0, []])

	    @property
	    def api_sloccount(self):
	        """sloccount output lines for the files that use the C API."""
	        return self.sloccount[0]

	    @api_sloccount.setter
	    def api_sloccount(self, value):
	        self.sloccount[0] = value

	    @property
	    def py_sloccount(self):
	        """Number of lines containing 'Py' in the files that use the C API."""
	        return self.sloccount[1]

	    @py_sloccount.setter
	    def py_sloccount(self, value):
	        self.sloccount[1] = value

	    @property
	    def total_sloccount(self):
	        """sloccount output lines for all extracted files of the package."""
	        return self.sloccount[2]

	    @total_sloccount.setter
	    def total_sloccount(self, value):
	        self.sloccount[2] = value

	    def __repr__(self):
	        """Return the multi-line, human-readable report for this package."""
	        # Newline plus eight spaces; the whitespace-damaged copy had lost
	        # all but one of the spaces.
	        indent = "\n" + " " * 8
	        # Sorted so that the report is identical from run to run; joining the
	        # bare set would follow its arbitrary iteration order.
	        extensions = ",".join(
	            sorted({os.path.splitext(f)[1] for f in self.interesting_files})
	        )
	        apis = pprint.pformat(list(self.native_apis.keys()), width=120, indent=1)
	        if "\n" in apis:
	            apis = ("\n" + apis).replace("\n", indent)
	        lines = list(self.total_sloccount)
	        if self.api_sloccount:
	            lines.append("In files using C API")
	            lines.extend(self.api_sloccount)
	        if self.py_sloccount:
	            lines.append(f"Counting only lines with 'Py' in them: {self.py_sloccount}")
	        # Every report line is followed by the indent, after one leading indent.
	        report = indent + "".join(line + indent for line in lines)
	        return f"{self.name}: {extensions} with {apis}\n{report}"


	def _spinner():
	    """Print the next frame of a one-character progress spinner per ``next()``."""
	    while True:
	        for frame in r"/-\|":
	            print("\033[1D", frame, sep="", end="", flush=True)
	            yield


	def _open_archive(filename, stamp):
	    """Open the download *filename* as zip or tar, judging by the *stamp* name.

	    Returns ``None`` when the stamp has no recognised archive extension.
	    """
	    if stamp.endswith(".zip"):
	        return zipfile.ZipFile(filename)
	    if TARFILE_PATTERN.search(stamp):
	        return tarfile.open(filename)
	    return None


	def _native_members(archive):
	    """Return the members of *archive* that look like native source files.

	    Zip archives contribute member names, tar archives ``TarInfo`` objects;
	    either form can be handed to the archive's ``extractall``. Absolute names
	    and names containing ``..`` are left out so nothing is extracted outside
	    the target directory.
	    """
	    if isinstance(archive, zipfile.ZipFile):
	        pairs = [(name, name) for name in archive.namelist()]
	    else:
	        pairs = [(member.name, member) for member in archive.getmembers()]
	    return [
	        member
	        for name, member in pairs
	        if not name.startswith("/")
	        and ".." not in name
	        and NATIVE_FILE_PATTERN.search(name)
	    ]


	def _sloc_by_language(path):
	    """Run ``sloccount`` on *path*; return its "SLOC-by-Language" line and the next one.

	    Returns an empty list when the header line is absent from the output or
	    the ``sloccount`` program cannot be started.
	    """
	    try:
	        # Argument list instead of a shell string: the directory name is
	        # derived from a package name and must not be interpreted by a shell.
	        proc = subprocess.run(
	            ["sloccount", path],
	            stdout=subprocess.PIPE,
	            stderr=subprocess.STDOUT,
	            text=True,
	            errors="replace",
	            check=False,
	        )
	    except OSError as err:
	        print(f"Cannot run sloccount on {path}: {err}")
	        return []
	    lines = proc.stdout.split("\n")
	    for pos, line in enumerate(lines):
	        if "SLOC-by-Language" in line:
	            return lines[pos : pos + 2]
	    return []


	def _count_py_lines(apidirname, spin):
	    """Count the lines containing ``Py`` in the files below *apidirname*."""
	    count = 0
	    for dirpath, _dirnames, filenames in os.walk(apidirname):
	        next(spin)
	        for fname in filenames:
	            # Cython sources are excluded from this count. They were copied
	            # as "*.pyx.py" for sloccount, so a plain ".pyx" extension test
	            # would never match them.
	            if fname.endswith((".pyx", ".pyx.py")):
	                continue
	            # Decode leniently: source files are not guaranteed to be UTF-8.
	            with open(
	                os.path.join(dirpath, fname), encoding="utf-8", errors="replace"
	            ) as src:
	                count += sum(1 for line in src if "Py" in line)
	    return count


	def _analyse_package(name, filename, stamp):
	    """Extract and scan one downloaded package.

	    Returns a ``PackageInfo`` when at least one file matches
	    ``NATIVE_API_PATTERN``, otherwise ``None``.
	    """
	    archive = _open_archive(filename, stamp)
	    if archive is None:
	        print(f"Don't know how to extract {stamp}")
	        return None
	    dirname = f"{filename}.dir"
	    apidirname = f"{filename}.dir.with_api_usage"
	    # The archive is only needed for listing and extraction; close it
	    # afterwards so a long run does not accumulate open file handles.
	    with archive:
	        members = _native_members(archive)
	        if members and not os.path.exists(dirname):
	            os.makedirs(dirname, exist_ok=True)
	            archive.extractall(path=dirname, members=members)
	    if not members:
	        return None

	    info = PackageInfo.new(name=name, interesting_files=set(), native_apis={})
	    spin = _spinner()
	    for member in members:
	        next(spin)
	        relname = getattr(member, "name", member)
	        path = os.path.join(dirname, relname)
	        if not os.path.isfile(path):
	            # Directory or dangling link whose name merely looks like a source file.
	            continue
	        with open(path, "rb") as f:
	            content = f.read().decode("utf-8", errors="replace")
	        uses_c_api = False
	        for m in NATIVE_API_PATTERN.finditer(content):
	            print("\033[1DX\033[1C", end="", flush=True)
	            info.interesting_files.add(path)
	            info.native_apis.setdefault(m.group(0), set()).add(path)
	            uses_c_api = True
	        if uses_c_api or PYTHON_API_PATTERN.search(content):
	            target = os.path.join(apidirname, relname)
	            os.makedirs(os.path.dirname(target), exist_ok=True)
	            # modify some file endings for sloccount to consider them
	            if target.endswith(".pyx"):
	                target += ".py"
	            elif target.endswith(".rs"):
	                target += ".cpp"
	            shutil.copy(path, target)

	    info.total_sloccount = _sloc_by_language(dirname)
	    api_sloc = []
	    if os.path.exists(apidirname):
	        api_sloc = _sloc_by_language(apidirname)
	        info.py_sloccount = info.py_sloccount + _count_py_lines(apidirname, spin)
	        shutil.rmtree(apidirname)
	    info.api_sloccount = api_sloc
	    print()
	    if info.interesting_files:
	        return info
	    # Nothing of interest: do not keep the extracted sources around.
	    shutil.rmtree(dirname)
	    return None


	def _print_summary(infos, examined):
	    """Print the per-package reports followed by the summed line counts."""
	    total_sloc = {}
	    api_sloc = {}
	    py_sloc = 0
	    print(f"\n{len(infos)} of top {examined} packages found to have C API usage.\n")
	    for info in infos:
	        print(info, "\n")
	        for line in info.total_sloccount:
	            for m in SLOCCOUNT_LANGUAGE_PATTERN.finditer(line):
	                total_sloc[m.group(1)] = total_sloc.get(m.group(1), 0) + int(m.group(2))
	        for line in info.api_sloccount:
	            for m in SLOCCOUNT_LANGUAGE_PATTERN.finditer(line):
	                api_sloc[m.group(1)] = api_sloc.get(m.group(1), 0) + int(m.group(2))
	        py_sloc += info.py_sloccount
	    print()
	    print("Totals")
	    print("Package SLOC:", total_sloc)
	    print("API using files SLOC:", api_sloc)
	    print("Lines with 'Py':", py_sloc)


	def main(projs, prefix=""):
	    """Download, extract and scan every project in *projs*, then print a report.

	    For each project the source archive is fetched with ``download_sdist``,
	    its native source files are extracted into ``<download>.dir`` and searched
	    for ``NATIVE_API_PATTERN``. Line counts come from the external
	    ``sloccount`` program.

	    Args:
	        projs: iterable of project names; may be empty.
	        prefix: text put in front of the running index in download file names.
	    """
	    infos = []
	    examined = 0  # tracked separately so an empty *projs* still reports cleanly
	    for idx, p in enumerate(projs):
	        examined = idx + 1
	        try:
	            downloaded = download_sdist(f"{prefix}{idx}", p)
	        except Exception:
	            # Boundary of the batch job: report and carry on with the next one.
	            traceback.print_exc()
	            print(f"Failed to download {p}")
	            continue
	        if downloaded is None:
	            # No source URL known; download_sdist has already said so.
	            continue
	        filename, stamp = downloaded
	        info = _analyse_package(p, filename, stamp)
	        if info is not None:
	            infos.append(info)
	    _print_summary(infos, examined)


	if __name__ == "__main__":
	    from argparse import ArgumentParser
	    import sys

	    parser = ArgumentParser(
	        description="Analyse the top packages either of PyPI or for data science. Must choose either or!"
	    )
	    parser.add_argument("--top5000", action="store_true")
	    parser.add_argument("--topDS", action="store_true")
	    args = parser.parse_args(sys.argv[1:])
	    # Exactly one of the two switches has to be given.
	    if args.top5000 == args.topDS:
	        parser.print_help()
	        sys.exit(1)
	    if args.top5000:
	        main(projects())
	    else:
	        main(DS_IMPORTANT_PACKAGES, prefix="DS_")