Skip to content

Instantly share code, notes, and snippets.

@kuwa72 kuwa72/scrape.scm
Created Feb 20, 2013

Embed
What would you like to do?
Gauche CGI, Scrape site-titles in LAN nodes with threads. LAN内で動いているWebサイトからTitleを抜いてリンク集にするCGIです。 マルチスレッドのおかげで概ね高速に動作しますが、Cygwinのpthreadでは情報が抜ける場合が多いです。
#!/Gauche/bin/gosh
(use text.html-lite)
(use rfc.http)
(use rfc.uri)
(use www.cgi)
(use sxml.ssax)
(use sxml.sxpath)
(load "htmlprag.scm")
(use srfi-27) ; random-integer
(use srfi-43) ; vector-swap!
(use gauche.threads)
;from http://d.hatena.ne.jp/rui314/20070118/p1
(random-source-randomize! default-random-source)
(define (shuffle lis)
(let1 vec (list->vector lis)
(do ((i (- (vector-length vec) 1) (- i 1)))
((= i 0) (vector->list vec))
(vector-swap! vec i (random-integer (+ i 1))))))
(define (intlist n)
(if (= n 0)
()
(cons n (intlist (- n 1)))))
(define (gettitle host path)
(guard (exc (else ())) ;drop error
(let-values (((status head body) (http-get host path)))
(let ((xml (html->shtml body)))
(list
status
(uri-compose :scheme "http" :host host :path path)
((sxpath "/html/head/title/text()") xml))))))
(define (mkthr a)
(if (null? a)
()
(cons
(thread-start! (make-thread (lambda _(gettitle
(string-join (list "192.168.11." (number->string (car a))) "")
"/"))))
(mkthr (cdr a)))))
(define (main args)
(let ((urls (remove null? (map thread-join! (mkthr (shuffle (intlist 255)))))))
(cgi-main
(lambda (params)
`(,(cgi-header)
,(html-doctype)
,(html:html
(html:head (html:title "list"))
(html:body
(html:ul
(map (lambda (v)
(if (equal? (car v) "200")
(html:li
(html:a :href (car (cdr v))
(cdr (cdr v))
))
()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.