Instantly share code, notes, and snippets.

@kuwa72 kuwa72/scrape.scm
Created Feb 20, 2013

What would you like to do?
Gauche CGI, Scrape site-titles in LAN nodes with threads. LAN内で動いているWebサイトからTitleを抜いてリンク集にするCGIです。 マルチスレッドのおかげで概ね高速に動作しますが、Cygwinのpthreadでは情報が抜ける場合が多いです。
(use text.html-lite)
(use rfc.http)
(use rfc.uri)
(use www.cgi)
(use sxml.ssax)
(use sxml.sxpath)
(load "htmlprag.scm")
(use srfi-27) ; random-integer
(use srfi-43) ; vector-swap!
(use gauche.threads)
(random-source-randomize! default-random-source)
(define (shuffle lis)
(let1 vec (list->vector lis)
(do ((i (- (vector-length vec) 1) (- i 1)))
((= i 0) (vector->list vec))
(vector-swap! vec i (random-integer (+ i 1))))))
(define (intlist n)
(if (= n 0)
(cons n (intlist (- n 1)))))
(define (gettitle host path)
(guard (exc (else ())) ;drop error
(let-values (((status head body) (http-get host path)))
(let ((xml (html->shtml body)))
(uri-compose :scheme "http" :host host :path path)
((sxpath "/html/head/title/text()") xml))))))
(define (mkthr a)
(if (null? a)
(thread-start! (make-thread (lambda _(gettitle
(string-join (list "192.168.11." (number->string (car a))) "")
(mkthr (cdr a)))))
(define (main args)
(let ((urls (remove null? (map thread-join! (mkthr (shuffle (intlist 255)))))))
(lambda (params)
(html:head (html:title "list"))
(map (lambda (v)
(if (equal? (car v) "200")
(html:a :href (car (cdr v))
(cdr (cdr v))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment