Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save milanaleksic/971326 to your computer and use it in GitHub Desktop.
Save milanaleksic/971326 to your computer and use it in GitHub Desktop.
Download all Vukajlija posters from the first 20 pages
package vukajlija
@Grab(group='org.codehaus.gpars', module='gpars', version='0.11')
import java.text.Normalizer
import java.util.regex.Pattern
import java.util.concurrent.atomic.AtomicInteger
import groovyx.gpars.GParsExecutorsPool
// Highest listing page number to scan; pages 1..MAX_PAGE are fetched.
def final MAX_PAGE = 20
// Directory the downloaded .jpg files are written into.
def final targetDir = "d:/temp/_3"
// Captures the value of every src="..." attribute found in the fetched page text.
def final REGEX_IMAGE = ~'src="([^"]+)"'
// Poster image URLs collected from the listing pages.
def list = new ArrayList()
/**
 * Runs a closure as a named stage, printing a start banner before it and the
 * elapsed wall-clock time in milliseconds once it has returned.
 *
 * @param processName label interpolated into both console messages
 * @param closure     zero-argument closure holding the work to time
 */
def calc(processName, closure) {
    println "\nStarting process [$processName]"
    final long startedAt = System.currentTimeMillis()
    closure.call()
    final long elapsedMillis = System.currentTimeMillis() - startedAt
    println "Process [$processName] done in ${elapsedMillis}ms"
}
/**
 * Turns a text fragment into a lower-case, dash-separated name.
 *
 * Steps, in order: strip combining diacritical marks, lower-case, turn spaces
 * and the listed punctuation characters into dashes, collapse runs of dashes
 * into one, and drop any trailing dashes.
 *
 * @param str text to convert
 * @return the converted text
 */
def String clean(String str) {
    // NFD splits accented letters into base letter + combining mark, so the marks can be removed.
    def decomposed = Normalizer.normalize(str, Normalizer.Form.NFD)
    def withoutMarks = Pattern.compile("\\p{InCombiningDiacriticalMarks}+").matcher(decomposed).replaceAll("")

    def slug = withoutMarks.toLowerCase()
    slug = slug.replaceAll(' ', '-')
    slug = slug.replaceAll('[,.\\?/\\(\\)„“\\!«»’"]', '-')
    slug = slug.replaceAll('-+', '-')
    return slug.replaceAll('-+$', '')
}
// Stage 1: fetch listing pages 1..MAX_PAGE in parallel and gather every image URL
// whose path contains '/posters/' into `list`.
calc("filling up") {
    GParsExecutorsPool.withPool(10) {
        (1..MAX_PAGE).eachParallel { strana ->
            def url = "http://vukajlija.com/zabava/posteri?strana=$strana"
            println "Fetching image URLs from page $url"
            // Each task reads its own page into its own String. A buffer shared between
            // the pool threads would interleave the bodies of different pages and make
            // every task re-scan everything fetched so far.
            String text = new URL(url).text
            def postersOnPage = []
            text.eachMatch(REGEX_IMAGE) { image, imageFound ->
                if (imageFound.indexOf('/posters/') != -1) {
                    postersOnPage << imageFound
                }
            }
            // `list` is shared by all pool threads and is not thread-safe, so the
            // per-page results are merged under a lock.
            synchronized (list) {
                list.addAll(postersOnPage)
            }
        }
    }
}
// Remove duplicate image URLs (unique() dedupes the list in place), then carry on with a fresh copy.
list.unique()
list = new ArrayList(list)
println "Unique Link count: ${list.size()}"
// Source of the numeric suffix used when naming pool threads "Worker<n>".
AtomicInteger newId = new AtomicInteger(0)
// Stage 2: download every collected image URL in parallel into targetDir,
// skipping files that already exist.
calc("downloading") {
    GParsExecutorsPool.withPool (10) {
        list.toArray().eachParallel { imageFound ->
            // Name each pool thread once so the log lines show which worker handled which file.
            if (!Thread.currentThread().name.startsWith("Worker"))
                Thread.currentThread().name = "Worker${newId.incrementAndGet()}"
            if (!imageFound)
                return
            // The target name is the last path segment of the URL, passed through clean().
            def filename = "${targetDir}/${clean(imageFound.tokenize('/')[-1])}.jpg"
            println "Thread ${Thread.currentThread().name} is working on $filename (original is $imageFound)"
            def target = new File(filename)
            if (target.exists())
                return
            def completed = false
            try {
                // withOutputStream / withInputStream close their streams on every exit path,
                // so neither the file handle nor the HTTP stream can leak.
                target.withOutputStream { output ->
                    println "Downloading ${filename}"
                    new URL(imageFound).withInputStream { input ->
                        output << input
                    }
                }
                completed = true
            } finally {
                // A failed transfer must not leave a truncated file behind: the exists()
                // check above would otherwise skip it on every later run.
                if (!completed)
                    target.delete()
            }
            println "Downloaded ${filename}"
        }
    }
}
println "Gotovo"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment