Skip to content

Instantly share code, notes, and snippets.

@milanaleksic
Created January 6, 2012 14:58
Show Gist options
  • Save milanaleksic/1570948 to your computer and use it in GitHub Desktop.
Save milanaleksic/1570948 to your computer and use it in GitHub Desktop.
Find bad URLs in exported Chrome bookmarks file
package misc
// Location of the exported Chrome bookmarks file to scan.
String bookmarksFile = 'c:/temp/bookmarks.html'
// Progress counter; atomic because it is incremented from the worker pool threads.
def counter = new java.util.concurrent.atomic.AtomicInteger(0)
// Link name -> link location for every bookmark found in the file.
Map<String, String> urls = [:]
// Link name -> link location for every link that failed the check.
// The pool workers write to this map concurrently, so the insertion-ordered map
// is wrapped in a synchronized view: unsynchronized puts into a bare
// LinkedHashMap can lose entries or corrupt its internal structure.
Map<String, String> badURLs = Collections.synchronizedMap([:])
// Collect every <A HREF="..."> bookmark of the export into `urls` (name -> location).
// The file is decoded explicitly as UTF-8 instead of the platform default charset,
// which would mangle non-ASCII link names on machines whose default differs.
// NOTE(review): assumes the Chrome export is UTF-8 encoded - confirm against a real export.
Set<String> seenLocations = new HashSet<String>()
new File(bookmarksFile).eachLine('UTF-8') { line ->
    (line =~ /(?i)<A HREF="([^"]+)"[^>]*>([^<]*)<.*/).each { entire, linkLocation, linkName ->
        // add() returns false when the location was already recorded; this replaces
        // a linear containsValue() scan over all links collected so far.
        if (!seenLocations.add(linkLocation)) {
            println "WARNING link duplicated: [$linkName] $linkLocation"
        }
        // Names are the map keys: a repeated name gets a random suffix so the
        // bookmark stored earlier under the same name is not overwritten.
        String key = urls.containsKey(linkName) ? linkName + UUID.randomUUID() : linkName
        urls[key] = linkLocation
    }
}
println "Links found and to be processed: ${urls.size()}"
// Probe every collected link on a pool of 10 worker threads; links that do not
// answer with HTTP 200, or that fail with an exception, are recorded in badURLs.
groovyx.gpars.GParsExecutorsPool.withPool(10) {
    int total = urls.size() // loop-invariant, computed once instead of per link
    urls.eachParallel { linkName, url ->
        def connection = null
        try {
            println "Analyzing url ${counter.incrementAndGet()}/${total} $url"
            connection = new URL(url).openConnection()
            connection.connectTimeout = 15000
            connection.readTimeout = 15000
            connection.connect()
            // For HTTP(S) compare the numeric status code: the reason phrase after
            // the code is free text, so matching the status line against "200 OK"
            // wrongly flags servers that send another phrase or none at all.
            // Other protocols keep the original header-line check.
            boolean ok = (connection instanceof HttpURLConnection) ?
                    (connection.responseCode == HttpURLConnection.HTTP_OK) :
                    (connection.getHeaderField(null) ==~ /.*200 OK/)
            if (!ok) {
                badURLs[linkName] = url
                println "\tUrl $url added to bad URL list because of response code: ${connection.getHeaderField(null)}"
            }
        } catch (Exception e) {
            // Malformed URLs, DNS failures, timeouts, refused connections etc. all
            // count as a bad link. JVM Errors are deliberately not caught here.
            println "\tIssue found while accessing [$url]: ${e.getMessage()}"
            badURLs[linkName] = url
        } finally {
            // Release the socket/stream held by the HTTP connection.
            if (connection instanceof HttpURLConnection) {
                connection.disconnect()
            }
        }
    }
}
// Final report: a header followed by one tab-indented line per bad link,
// formatted as "<link name> [<link location>]".
println '\n\n--------------\nBad URLs:'
for (entry in badURLs) {
    println "\t${entry.key} [${entry.value}]"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment