Skip to content

Instantly share code, notes, and snippets.

@lexi-lambda
Created April 12, 2020 19:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lexi-lambda/9ab0ce6e572ae75b3acf668944abbfba to your computer and use it in GitHub Desktop.
Save lexi-lambda/9ab0ce6e572ae75b3acf668944abbfba to your computer and use it in GitHub Desktop.
A hacky script for scraping packages that use arrow notation from Hackage
#lang racket
(require db/base
db/sqlite3
json
net/url
racket/async-channel
threading)
(define WORKERS 6)
(define cabal-path (find-executable-path "cabal"))
(define grep-path (find-executable-path "grep"))
(define (initialize-database!)
(unless (table-exists? the-package-db "packages")
(query-exec the-package-db "CREATE TABLE packages(name TEXT PRIMARY KEY, status TEXT)")
(query-exec the-package-db "CREATE INDEX packages_status ON packages(status)")))
(define (fetch-package-list-from-hackage)
(define packages (call/input-url (string->url "https://hackage.haskell.org/packages/")
(λ (url) (get-pure-port url (list "Accept: application/json")))
read-json))
(for/list ([package (in-list packages)])
(hash-ref package 'packageName)))
(define (prune-package-list packages)
(define values-expr (~> (for/list ([n (in-range (length packages))]) (~a "$" n))
(string-join _ "),(")
(string-append "VALUES(" _ ")")))
(apply query-list the-package-db
(~a "WITH all_packages(name) AS (" values-expr ") "
"SELECT all_packages.name FROM all_packages "
"LEFT JOIN packages ON packages.name = all_packages.name "
"WHERE packages.name IS NULL "
" OR packages.status = 'fail'")
packages))
(define (package-unpack-dir package)
(build-path "/tmp/cabal-packages" package))
(define (unpack-package package)
(define destination (package-unpack-dir package))
(parameterize ([current-input-port (open-input-string "")]
[current-output-port (open-output-nowhere)])
(system* cabal-path "get" "-d" destination package)))
(define (locate-package-dir package)
(define destination (package-unpack-dir package))
(and (directory-exists? destination)
(for/first ([subdir (in-list (directory-list destination))]
#:when (string-prefix? (path->string subdir) package))
(build-path destination subdir))))
(define (get-package-dir package)
(or (locate-package-dir package)
(begin (unpack-package package)
(locate-package-dir package))))
(define (process-worklist initial-worklist worker-proc progress-proc #:workers [workers WORKERS])
(define worklist-box (box initial-worklist))
(define (get-work-item!)
(let loop ()
(match (unbox worklist-box)
[(and worklist (cons item items))
(if (box-cas! worklist-box worklist items)
item
(loop))]
['() #f])))
(define progress-chan (make-async-channel 100))
(define progress-thread
(thread
(λ ()
(let loop ()
(match (async-channel-get progress-chan)
[`#s(result ,worker-id ,item ,result)
(progress-proc worker-id item result)
(loop)]
['done (void)])))))
(define worker-threads
(for/list ([worker-id (in-range workers)])
(thread
(λ ()
(let loop ()
(define item (get-work-item!))
(when item
(define result (with-handlers ([exn:fail? values])
(worker-proc worker-id item)))
(async-channel-put progress-chan `#s(result ,worker-id ,item ,result))
(loop)))))))
(for-each thread-wait worker-threads)
(async-channel-put progress-chan 'done)
(thread-wait progress-thread))
(define the-package-db (sqlite3-connect #:database "/tmp/packages.sqlite3" #:mode 'create #:use-place #t))
(initialize-database!)
(define (do-scan-for-packages!)
(define the-package-list (fetch-package-list-from-hackage))
(define pruned-package-list (prune-package-list the-package-list))
(define num-packages (length the-package-list))
(process-worklist
pruned-package-list
(λ (worker-id package)
(define dir (get-package-dir package))
(parameterize ([current-input-port (open-input-string "")])
(system* grep-path "-REq" "\\bArrows\\b" dir)))
(let ([processed (- num-packages (length pruned-package-list))])
(λ (worker-id package result)
(set! processed (add1 processed))
(define percentage (* (/ processed num-packages) 100))
(query-exec the-package-db
"INSERT OR REPLACE INTO packages(name, status) VALUES($1, $2)"
package
(match result
[#t "yes"]
[#f "no"]
[(? exn?) "fail"]))
(printf "(~a%) ~a ~a\n"
(~r percentage #:precision '(= 2) #:min-width 6)
(match result
[#t " YES"]
[#f " NO"]
[(? exn?) "FAIL"])
package)
(flush-output)))))
(define (find-banana-paths package)
(define dir (get-package-dir package))
(define-values [paths-in paths-out] (make-pipe))
(parameterize ([current-input-port (open-input-string "")]
[current-output-port paths-out])
(system* grep-path "-RFl" "(|" dir))
(close-output-port paths-out)
(for*/list ([matching-path-string (in-lines paths-in)]
[matching-path (in-value (simple-form-path matching-path-string))]
#:unless (member (path-get-extension matching-path)
'(#f #".a" #".bin" #".c" #".class" #".el" #".eot" #".exe" #".hi" #".html" #".idr" #".js" #".md"
#".mp3" #".o" #".pdf" #".png" #".p_o" #".properties" #".ps" #".rsc" #".rst" #".rtree"
#".str" #".tbl" #".ttf" #".xml"))
#:unless (let ([segments (map path->string (explode-path matching-path))])
(ormap (λ (needle) (member needle segments))
'("Agda-2.6.1" ; \ idiom brackets
"idris-1.3.2" ; /
"aern2-mp-0.1.4" ; \ LaTeX
"AlgorithmW-0.1.1.0" ; |
"frown-0.6.2.3" ; |
"HaTeX-3.22.2.0" ; |
"myTestlll-1.0.0" ; |
"TransformersStepByStep-0.1.1.0" ; /
"arrows-0.4.4.2" ; already broken
"Font.hs" ; \ file is raw data
"SfxGong.hs" ; /
; non-implementation code
".git" "examples" "native" "test" "tests" "testdata"
; only uses are parsing Haskell
"arrowp-0.5.0.2" "CCA-0.1.5.3" "fast-tags-2.0.0" "freesect-0.8" "ghc-8.6.5"
"ghc-exactprint-0.6.3" "ghc-lib-8.10.1.20200324" "ghc-lib-parser-8.10.1.20200324"
"haskell-src-exts-1.23.0" "haskell-tools-ast-1.1.1.0" "highlighting-kate-0.6.4"
"module-management-0.21" "ormolu-0.0.3.1" "skylighting-0.8.3.4" "visual-prof-0.5")))
#:when (call-with-input-file* matching-path
(λ (matching-in) (regexp-match? #px"\\(\\|(?![-|<>#$*&=)])" matching-in))))
matching-path))
(define (do-search-for-bananas!)
(define arrow-packages (query-list the-package-db "SELECT name FROM packages WHERE status = 'yes' ORDER BY name ASC"))
(define num-packages (length arrow-packages))
(process-worklist
arrow-packages
(λ (worker-id package)
(not (empty? (find-banana-paths package))))
(let ([processed 0])
(λ (worker-id package result)
(set! processed (add1 processed))
(define percentage (* (/ processed num-packages) 100))
(when (eq? result #t)
(query-exec the-package-db "UPDATE packages SET status = 'bananas' WHERE name = $1" package))
(printf "(~a%) ~a ~a\n"
(~r percentage #:precision '(= 2) #:min-width 6)
(match result
[#t "YES"]
[#f " NO"])
package)
(flush-output)))))
(define (print-banana-paths!)
(define banana-packages (query-list the-package-db "SELECT name FROM packages WHERE status = 'bananas' ORDER BY name ASC"))
(for* ([package (in-list banana-packages)]
[matching-path (in-list (find-banana-paths package))])
(displayln (path->string matching-path))))
;(do-scan-for-packages!)
;(do-search-for-bananas!)
;(print-banana-paths!)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment