Created
May 13, 2011 21:08
-
-
Save milanaleksic/971326 to your computer and use it in GitHub Desktop.
Download all Vukajlija posters from the first 20 pages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package vukajlija | |
@Grab(group='org.codehaus.gpars', module='gpars', version='0.11') | |
import java.text.Normalizer | |
import java.util.regex.Pattern | |
import java.util.concurrent.atomic.AtomicInteger | |
import groovyx.gpars.GParsExecutorsPool | |
// Number of listing pages to scan (pages 1..MAX_PAGE).
final MAX_PAGE = 20
// Directory the downloaded images are written into.
final targetDir = 'd:/temp/_3'
// Captures the value of every src="..." attribute found in a page.
final REGEX_IMAGE = Pattern.compile('src="([^"]+)"')
// Image URLs collected by the scan phase and consumed by the download phase.
def list = []
/**
 * Runs {@code closure} and prints a start line and a completion line with the
 * elapsed wall-clock time in milliseconds.
 *
 * @param processName label inserted into both log lines
 * @param closure     the work to execute; invoked once with no arguments
 */
def calc(processName, closure) {
    println "\nStarting process [$processName]"
    final long startedAt = System.currentTimeMillis()
    closure.call()
    final long elapsedMillis = System.currentTimeMillis() - startedAt
    println "Process [$processName] done in ${elapsedMillis}ms"
}
/**
 * Turns an arbitrary name into a lowercase, dash-separated slug.
 *
 * Steps: decompose to NFD and drop combining diacritical marks, lowercase,
 * replace spaces and a fixed set of punctuation characters with '-',
 * collapse runs of '-' into one, and strip trailing dashes.
 *
 * @param str the text to convert; must not be null
 * @return the slug, e.g. "Čačak, Šabac!" becomes "cacak-sabac"
 */
String clean(String str) {
    String nfdNormalizedString = Normalizer.normalize(str, Normalizer.Form.NFD)
    Pattern pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+")
    String retValue = pattern.matcher(nfdNormalizedString).replaceAll("")
    // Locale.ROOT keeps the result independent of the JVM's default locale
    // (with a Turkish default, 'I' would otherwise lowercase to dotless 'ı').
    retValue = retValue.toLowerCase(Locale.ROOT)
            .replaceAll(' ', '-')
            .replaceAll('[,.\\?/\\(\\)„“\\!«»’"]', '-')
    return retValue.replaceAll('-+', '-').replaceAll('-+$', '')
}
// Scan listing pages 1..MAX_PAGE in parallel and collect every image URL
// whose path contains '/posters/' into `list`.
calc("filling up") {
    GParsExecutorsPool.withPool(10) {
        // Each worker returns the matches of its own page; nothing mutable is
        // shared between threads while the pages are being fetched and scanned.
        def matchesPerPage = (1..MAX_PAGE).collectParallel { strana ->
            def url = "http://vukajlija.com/zabava/posteri?strana=$strana"
            println "Fetching image URLs from page $url"
            // getText() reads the whole response and closes the stream.
            String text = new URL(url).text
            def found = []
            text.eachMatch(REGEX_IMAGE) { image, imageFound ->
                if (imageFound.indexOf('/posters/') != -1) {
                    found << imageFound
                }
            }
            found
        }
        // collectParallel has returned, so this merge runs on the calling
        // thread only and the plain ArrayList needs no synchronization.
        matchesPerPage.each { list.addAll(it) }
    }
}
// Remove duplicate links (unique() de-duplicates in place), then continue
// with a fresh copy of the result.
list.unique()
list = new ArrayList(list)
println "Unique Link count: ${list.size()}"
// Source of sequence numbers for naming the download worker threads.
AtomicInteger newId = new AtomicInteger()
// Download every collected image into targetDir using a pool of 10 workers.
calc("downloading") {
    // Create the destination up front; without it every download would fail.
    new File(targetDir).mkdirs()
    GParsExecutorsPool.withPool(10) {
        list.toArray().eachParallel { imageFound ->
            // Name the pool thread the first time it picks up work so the
            // log lines below are easy to attribute.
            if (!Thread.currentThread().name.startsWith("Worker")) {
                Thread.currentThread().name = "Worker${newId.incrementAndGet()}"
            }
            if (!imageFound) {
                return
            }
            def filename = "${targetDir}/${clean(imageFound.tokenize('/')[-1])}.jpg"
            println "Thread ${Thread.currentThread().name} is working on $filename (original is $imageFound)"
            def target = new File(filename)
            // Files already present are treated as downloaded and skipped.
            if (target.exists()) {
                return
            }
            println "Downloading ${filename}"
            try {
                // withOutputStream / withInputStream close both streams,
                // whether the copy succeeds or throws.
                target.withOutputStream { output ->
                    new URL(imageFound).withInputStream { input ->
                        output << input
                    }
                }
            } catch (Exception e) {
                // Remove the partial file; otherwise the exists() check above
                // would skip it on the next run and keep the truncated image.
                target.delete()
                throw e
            }
            println "Downloaded ${filename}"
        }
    }
}
println "Gotovo"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment