Skip to content

Instantly share code, notes, and snippets.

@gregtaole
Created June 18, 2018 10:12
Show Gist options
  • Save gregtaole/046859ab26a9a0ed4ea85c3ac46688b8 to your computer and use it in GitHub Desktop.
Save gregtaole/046859ab26a9a0ed4ea85c3ac46688b8 to your computer and use it in GitHub Desktop.
Scrape https://docs.python.org/3/py-modindex.html to get the list of Python standard-library modules, then use the generated list to display the non-standard-library dependencies of a Python project
package main
import (
"bufio"
"flag"
"fmt"
"os"
"path/filepath"
"regexp"
"strings"
"sync"
)
// wg counts the in-flight findImports and ParseImports goroutines. main adds
// to it before starting each batch and waits on it between pipeline stages;
// every worker calls wg.Done when it returns.
var wg sync.WaitGroup
// pyStdlib lists the names of Python 3 standard-library modules, including
// dotted submodules. removeStdlibAndUser compares imported module names
// against it so that standard-library imports are not reported as
// dependencies.
// NOTE(review): presumably generated from the Python module index; regenerate
// when targeting a newer Python release.
var pyStdlib = []string{
"__future__",
"__main__",
"_dummy_thread",
"_thread",
"abc",
"aifc",
"argparse",
"array",
"ast",
"asynchat",
"asyncio",
"asyncore",
"atexit",
"audioop",
"base64",
"bdb",
"binascii",
"binhex",
"bisect",
"builtins",
"bz2",
"calendar",
"cgi",
"cgitb",
"chunk",
"cmath",
"cmd",
"code",
"codecs",
"codeop",
"collections",
"collections.abc",
"colorsys",
"compileall",
"concurrent",
"concurrent.futures",
"configparser",
"contextlib",
"copy",
"copyreg",
"cProfile",
"crypt",
"csv",
"ctypes",
"curses",
"curses.ascii",
"curses.panel",
"curses.textpad",
"datetime",
"dbm",
"dbm.dumb",
"dbm.gnu",
"dbm.ndbm",
"decimal",
"difflib",
"dis",
"distutils",
"distutils.archive_util",
"distutils.bcppcompiler",
"distutils.ccompiler",
"distutils.cmd",
"distutils.command",
"distutils.command.bdist",
"distutils.command.bdist_dumb",
"distutils.command.bdist_msi",
"distutils.command.bdist_packager",
"distutils.command.bdist_rpm",
"distutils.command.bdist_wininst",
"distutils.command.build",
"distutils.command.build_clib",
"distutils.command.build_ext",
"distutils.command.build_py",
"distutils.command.build_scripts",
"distutils.command.check",
"distutils.command.clean",
"distutils.command.config",
"distutils.command.install",
"distutils.command.install_data",
"distutils.command.install_headers",
"distutils.command.install_lib",
"distutils.command.install_scripts",
"distutils.command.register",
"distutils.command.sdist",
"distutils.core",
"distutils.cygwinccompiler",
"distutils.debug",
"distutils.dep_util",
"distutils.dir_util",
"distutils.dist",
"distutils.errors",
"distutils.extension",
"distutils.fancy_getopt",
"distutils.file_util",
"distutils.filelist",
"distutils.log",
"distutils.msvccompiler",
"distutils.spawn",
"distutils.sysconfig",
"distutils.text_file",
"distutils.unixccompiler",
"distutils.util",
"distutils.version",
"doctest",
"dummy_threading",
"email",
"email.charset",
"email.contentmanager",
"email.encoders",
"email.errors",
"email.generator",
"email.header",
"email.headerregistry",
"email.iterators",
"email.message",
"email.mime",
"email.parser",
"email.policy",
"email.utils",
"encodings",
"encodings.idna",
"encodings.mbcs",
"encodings.utf_8_sig",
"ensurepip",
"enum",
"errno",
"faulthandler",
"fcntl",
"filecmp",
"fileinput",
"fnmatch",
"formatter",
"fpectl",
"fractions",
"ftplib",
"functools",
"gc",
"getopt",
"getpass",
"gettext",
"glob",
"grp",
"gzip",
"hashlib",
"heapq",
"hmac",
"html",
"html.entities",
"html.parser",
"http",
"http.client",
"http.cookiejar",
"http.cookies",
"http.server",
"imaplib",
"imghdr",
"imp",
"importlib",
"importlib.abc",
"importlib.machinery",
"importlib.util",
"inspect",
"io",
"ipaddress",
"itertools",
"json",
"json.tool",
"keyword",
"lib2to3",
"linecache",
"locale",
"logging",
"logging.config",
"logging.handlers",
"lzma",
"macpath",
"mailbox",
"mailcap",
"marshal",
"math",
"mimetypes",
"mmap",
"modulefinder",
"msilib",
"msvcrt",
"multiprocessing",
"multiprocessing.connection",
"multiprocessing.dummy",
"multiprocessing.managers",
"multiprocessing.pool",
"multiprocessing.sharedctypes",
"netrc",
"nis",
"nntplib",
"numbers",
"operator",
"optparse",
"os",
"os.path",
"ossaudiodev",
"parser",
"pathlib",
"pdb",
"pickle",
"pickletools",
"pipes",
"pkgutil",
"platform",
"plistlib",
"poplib",
"posix",
"pprint",
"profile",
"pstats",
"pty",
"pwd",
"py_compile",
"pyclbr",
"pydoc",
"queue",
"quopri",
"random",
"re",
"readline",
"reprlib",
"resource",
"rlcompleter",
"runpy",
"sched",
"secrets",
"select",
"selectors",
"shelve",
"shlex",
"shutil",
"signal",
"site",
"smtpd",
"smtplib",
"sndhdr",
"socket",
"socketserver",
"spwd",
"sqlite3",
"ssl",
"stat",
"statistics",
"string",
"stringprep",
"struct",
"subprocess",
"sunau",
"symbol",
"symtable",
"sys",
"sysconfig",
"syslog",
"tabnanny",
"tarfile",
"telnetlib",
"tempfile",
"termios",
"test",
"test.support",
"textwrap",
"threading",
"time",
"timeit",
"tkinter",
"tkinter.scrolledtext",
"tkinter.tix",
"tkinter.ttk",
"token",
"tokenize",
"trace",
"traceback",
"tracemalloc",
"tty",
"turtle",
"turtledemo",
"types",
"typing",
"unicodedata",
"unittest",
"unittest.mock",
"urllib",
"urllib.error",
"urllib.parse",
"urllib.request",
"urllib.response",
"urllib.robotparser",
"uu",
"uuid",
"venv",
"warnings",
"wave",
"weakref",
"webbrowser",
"winreg",
"winsound",
"wsgiref",
"wsgiref.handlers",
"wsgiref.headers",
"wsgiref.simple_server",
"wsgiref.util",
"wsgiref.validate",
"xdrlib",
"xml",
"xml.dom",
"xml.dom.minidom",
"xml.dom.pulldom",
"xml.etree.ElementTree",
"xml.parsers.expat",
"xml.parsers.expat.errors",
"xml.parsers.expat.model",
"xml.sax",
"xml.sax.handler",
"xml.sax.saxutils",
"xml.sax.xmlreader",
"xmlrpc",
"xmlrpc.client",
"xmlrpc.server",
"zipapp",
"zipfile",
"zipimport",
"zlib",
}
func main() {
pathFlag := flag.String("d", ".", "Path to the directory containing the python source files")
excludeFlag := flag.String("e", "__pycache__", "Comma-separated list of directories to exclude")
flag.Parse()
excludeDirs := strings.Split(*excludeFlag, ",")
pyFiles := make([]string, 0)
err := filepath.Walk(*pathFlag, func(path string, info os.FileInfo, err error) error {
if err != nil {
return fmt.Errorf("could not read filepath %q : %v", *pathFlag, err)
}
for _, dir := range excludeDirs {
if info.IsDir() && info.Name() == dir {
return filepath.SkipDir
}
}
matched, err := regexp.MatchString(".py", path)
if err != nil {
return fmt.Errorf("error applying regular expression to %q : %v", path, err)
}
if matched {
pyFiles = append(pyFiles, path)
}
return nil
})
if err != nil {
fmt.Printf("error while walking the directory tree at %q : %v", *pathFlag, err)
}
importsChan := make(chan string)
errChan := make(chan error)
wg.Add(len(pyFiles))
for _, file := range pyFiles {
go findImports(file, importsChan, errChan)
}
imports := make([]string, 0)
go func() {
for val := range importsChan {
imports = append(imports, val)
}
}()
go func() {
for err := range errChan {
fmt.Fprintf(os.Stderr, "%v", err)
}
}()
wg.Wait()
packagesChan := make(chan string)
packages := make([]string, 0)
for _, importString := range imports {
wg.Add(1)
go ParseImports(importString, packagesChan, errChan)
}
go func() {
for pack := range packagesChan {
packages = append(packages, pack)
}
}()
wg.Wait()
uniq := unique(packages)
clean := removeStdlibAndUser(uniq, pyFiles)
for _, mod := range clean {
fmt.Println(mod)
}
}
// importLineRe recognises a Python import statement after the surrounding
// whitespace has been trimmed: "import <module>..." or
// "from <module> import ...". It is compiled once instead of once per line.
var importLineRe = regexp.MustCompile(`^(?:import\s+\S|from\s+\S+\s+import\b)`)

// findImports scans the file at filePath line by line and sends every line
// that is a Python import statement, with leading and trailing whitespace
// removed, on importsChan.
//
// Lines that merely mention "import" (comments, string literals, calls such
// as importlib.reload) are not forwarded; indented import statements are.
// Failures to open or read the file are reported on errChan. wg.Done is
// called exactly once when the function returns.
func findImports(filePath string, importsChan chan<- string, errChan chan<- error) {
	defer wg.Done()

	file, err := os.Open(filePath)
	if err != nil {
		errChan <- fmt.Errorf("could not open file %v for reading : %v", filePath, err)
		return
	}
	// The file is only read, so an error from Close carries no information.
	defer file.Close()

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := strings.TrimSpace(scanner.Text())
		if importLineRe.MatchString(line) {
			importsChan <- line
		}
	}
	// Report the scanner's own error, not the (nil) error left over from Open.
	if err := scanner.Err(); err != nil {
		errChan <- fmt.Errorf("error while scanning %v : %v", filePath, err)
	}
}
/*
ParseImports extracts the top-level Python package names contained in
importString and sends each of them on packagesChan.

Both statement forms are understood, with or without leading indentation:

	import a.b as c, d   -> "a", "d"
	from a.b import c    -> "a"

A trailing "#" comment is ignored. Relative imports ("from . import x",
"from .mod import x") refer to the project itself and produce nothing. A line
that is not an import statement is skipped instead of causing a panic.

errChan is kept for signature compatibility; nothing is sent on it. wg.Done
is called exactly once when the function returns.
*/
func ParseImports(importString string, packagesChan chan<- string, errChan chan<- error) {
	defer wg.Done()

	statement := importString
	if i := strings.Index(statement, "#"); i >= 0 {
		statement = statement[:i]
	}
	// Fields, unlike Split on a single space, tolerates indentation, tabs and
	// repeated spaces.
	fields := strings.Fields(statement)
	if len(fields) < 2 {
		return
	}

	var modules []string
	switch fields[0] {
	case "import":
		// One statement may name several comma-separated modules, each with
		// an optional "as alias" suffix; the module is the first word.
		for _, clause := range strings.Split(strings.Join(fields[1:], " "), ",") {
			if words := strings.Fields(clause); len(words) > 0 {
				modules = append(modules, words[0])
			}
		}
	case "from":
		modules = append(modules, fields[1])
	}

	for _, module := range modules {
		if top := topLevelPackage(module); top != "" {
			packagesChan <- top
		}
	}
}

// topLevelPackage returns the leading identifier of a dotted module path
// ("os.path" -> "os"). It returns "" when module does not start with an
// identifier character, which is the case for relative imports.
func topLevelPackage(module string) string {
	end := strings.IndexFunc(module, func(r rune) bool {
		isIdent := r == '_' ||
			(r >= 'a' && r <= 'z') ||
			(r >= 'A' && r <= 'Z') ||
			(r >= '0' && r <= '9') ||
			r > 127 // non-ASCII letters are allowed in Python identifiers
		return !isIdent
	})
	if end < 0 {
		return module
	}
	return module[:end]
}
// unique returns the distinct strings of imports in order of first
// appearance. The result is always non-nil, even for empty input.
//
// Keeping the input order (rather than ranging over a map, whose iteration
// order is random) makes the program's output reproducible between runs.
func unique(imports []string) []string {
	seen := make(map[string]struct{}, len(imports))
	uniqueImportsList := make([]string, 0, len(imports))
	for _, imp := range imports {
		if _, dup := seen[imp]; dup {
			continue
		}
		seen[imp] = struct{}{}
		uniqueImportsList = append(uniqueImportsList, imp)
	}
	return uniqueImportsList
}
// removeStdlibAndUser returns the entries of uniq that are neither Python
// standard-library modules (as listed in pyStdlib) nor modules of the scanned
// project itself, preserving the order of uniq. The result is always non-nil.
//
// A name counts as a project module when it equals a path component of one of
// pyFiles: a directory name (a package) or a file name without its ".py"
// extension (a module). Matching whole components, instead of searching for
// the name anywhere in the path, keeps e.g. "requests" from being hidden by a
// file called "requests_helper.py". Empty names are dropped.
func removeStdlibAndUser(uniq, pyFiles []string) []string {
	stdlib := make(map[string]struct{}, len(pyStdlib))
	for _, name := range pyStdlib {
		stdlib[name] = struct{}{}
	}

	local := make(map[string]struct{})
	for _, file := range pyFiles {
		// Normalise separators so the split works on every platform.
		parts := strings.Split(filepath.ToSlash(file), "/")
		last := len(parts) - 1
		parts[last] = strings.TrimSuffix(parts[last], ".py")
		for _, part := range parts {
			local[part] = struct{}{}
		}
	}

	clean := make([]string, 0, len(uniq))
	for _, mod := range uniq {
		if mod == "" {
			continue
		}
		if _, ok := stdlib[mod]; ok {
			continue
		}
		if _, ok := local[mod]; ok {
			continue
		}
		clean = append(clean, mod)
	}
	return clean
}
package main
import (
"bytes"
"errors"
"fmt"
"io"
"log"
"net/http"
"regexp"
"golang.org/x/net/html"
)
// url is the Python 3 module index page that main downloads and parses.
const url = "https://docs.python.org/3/py-modindex.html"
// main downloads the Python module index at url, collects the nodes returned
// by getContent, and prints their text as a Go []string literal on stdout,
// one quoted module name per line.
//
// Any failure (request error, non-200 response, unparsable body, no module
// nodes found) is fatal.
func main() {
	resp, err := http.Get(url)
	if err != nil {
		log.Fatalf("could not get url %v : %v", url, err)
	}
	defer resp.Body.Close()
	// Without this check an error page would be parsed as if it were the
	// module index and yield an empty or bogus list.
	if resp.StatusCode != http.StatusOK {
		log.Fatalf("could not get url %v : unexpected status %v", url, resp.Status)
	}

	doc, err := html.Parse(resp.Body)
	if err != nil {
		log.Fatalf("could not parse response body : %v", err)
	}
	content, err := getContent(doc)
	if err != nil {
		log.Fatalf("could not get content from parsed document : %v", err)
	}

	// Each node renders as <code class="xref">name</code>; removing the two
	// tags leaves the bare module name.
	re := regexp.MustCompile("(<code class=\"xref\">)|(</code>)")
	fmt.Println("[]string{")
	for _, mod := range content {
		fmt.Printf("\t\"%v\",\n", re.ReplaceAllString(string(renderNode(mod)), ""))
	}
	fmt.Println("}")
}
// getContent walks the tree rooted at doc depth-first and returns, in
// document order, every <code> element whose class attribute is exactly
// "xref". It returns an error when no such element exists.
//
// The match is restricted to <code class="xref"> so that it agrees with the
// error text below and with the tag-stripping expression in main, and each
// element is added at most once even if it carries several attributes.
func getContent(doc *html.Node) ([]*html.Node, error) {
	modules := make([]*html.Node, 0)
	var visit func(*html.Node)
	visit = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "code" {
			for _, attr := range n.Attr {
				if attr.Key == "class" && attr.Val == "xref" {
					modules = append(modules, n)
					break
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			visit(c)
		}
	}
	visit(doc)
	if len(modules) == 0 {
		return nil, errors.New("\"code\" tag not found")
	}
	return modules, nil
}
// renderNode serialises n with html.Render into an in-memory buffer and
// returns the buffer's bytes. An error from html.Render is not reported; the
// bytes written up to that point are returned.
func renderNode(n *html.Node) []byte {
	out := new(bytes.Buffer)
	var sink io.Writer = out
	_ = html.Render(sink, n)
	return out.Bytes()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment