Skip to content

Instantly share code, notes, and snippets.

@chetanmeh
Created June 3, 2015 09:02
Show Gist options
  • Save chetanmeh/be66363172532e09ee7d to your computer and use it in GitHub Desktop.
Save chetanmeh/be66363172532e09ee7d to your computer and use it in GitHub Desktop.
Script to generate a CSV file containing the binary-related metadata required for https://issues.apache.org/jira/browse/OAK-2953
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import groovy.text.SimpleTemplateEngine
import org.apache.jackrabbit.api.JackrabbitValue
import org.apache.jackrabbit.api.ReferenceBinary
import org.apache.jackrabbit.commons.visitor.FilteringItemVisitor
import org.apache.sling.jcr.api.SlingRepository
import org.slf4j.LoggerFactory
import javax.jcr.Binary
import javax.jcr.Node
import javax.jcr.Property
import javax.jcr.Session
import static org.apache.jackrabbit.JcrConstants.*
import static org.apache.jackrabbit.oak.commons.IOUtils.humanReadableByteCount
// Entry point of the script: traverses the repository below `root`, optionally writes one CSV
// line per binary (BLOB_ID, LENGTH, JCR_MIMETYPE, JCR_ENCODING, JCR_PATH) and finally logs
// per-mimetype statistics.

//Set it to root under which binaries need to be checked
def root = '/content'

//Path on server side where CSV file needs to be created.
//Set it to null to skip the CSV and only collect the per-mimetype statistics.
File outFile = new File('binary-stats.csv');

SlingRepository repo = osgi.getService(SlingRepository.class)
Session s = null

// Deliberately assigned without `def`/type so that it is stored in the script binding and is
// therefore visible from the script-level methods (logp uses it).
logger = LoggerFactory.getLogger("script-console")

try {
    s = repo.loginAdministrative(null)

    // mimeType -> [count, name, size]; entries are created lazily on first access
    def typeStatsMap = [:].withDefault {key -> [count:0, name:key, size:0L]}

    if (outFile){
        outFile.withPrintWriter {pw ->
            collectBinaryStats(s.getNode(root), typeStatsMap){ b ->
                // BLOB_ID, LENGTH, JCR_MIMETYPE, JCR_ENCODING, JCR_PATH
                pw.println ([b.blobId, b.binarySize, quote(b.mimeType), quote(b.encoding), quote(b.path)].join(','))
            }
        }
        logp ("Created the csv file at ${outFile.absolutePath}")
    } else {
        // No CSV requested: still traverse so that the statistics get populated
        collectBinaryStats(s.getNode(root), typeStatsMap){}
    }

    // Seed with a long so the running sum is a long from the first element on
    def totalSize = typeStatsMap.values().inject(0L) {sum, stat -> sum + stat.size}
    logp("MimeType Stats - Total size ${humanReadableByteCount(totalSize)}")
    dumpStats(typeStatsMap)
} finally {
    // Always release the administrative session, even if the traversal failed
    s?.logout()
}
/**
 * Visits every node below (and including) the given root and, for each node that has both a
 * jcr:mimeType and a jcr:data property, updates the per-mimetype statistics and reports the
 * binary to the callback.
 *
 * @param root           node at which the traversal starts
 * @param typeStatsMap   map keyed by mimetype whose values expose mutable `count` and `size`
 *                       entries; it is updated in place
 * @param binaryCallback invoked once per binary for which a blob id could be determined, with a
 *                       map containing blobId, binarySize, mimeType, encoding and path
 * @return an empty list; this function never added anything to the returned list, and the
 *         return value is retained only so existing callers keep working
 */
def collectBinaryStats(Node root, def typeStatsMap, Closure binaryCallback){
    long count = 0
    // null when no Oak BlobStore service can be obtained (see fetchBlobStore)
    def blobStore = fetchBlobStore()

    FilteringItemVisitor v = new FilteringItemVisitor(){
        protected void entering(Property property, int i){ }
        protected void leaving(Property property, int i){ }
        protected void leaving(Node node, int i){ }

        protected void entering(Node node, int i){
            count++

            if(node.hasProperty(JCR_MIMETYPE) && node.hasProperty(JCR_DATA)){
                def mimeType = node.getProperty(JCR_MIMETYPE).getString()
                def stats = typeStatsMap[mimeType]
                long binarySize = 0L
                def blobId = null
                def encoding = node.hasProperty(JCR_ENCODING) ? node.getProperty(JCR_ENCODING).getString() : null

                Binary b = node.getProperty(JCR_DATA).getBinary()
                try{
                    binarySize = b.size

                    if (b instanceof JackrabbitValue){
                        blobId = b.contentIdentity
                    } else if (blobStore && b instanceof ReferenceBinary){
                        // Resolve the reference once and reuse it for the lookup
                        def ref = b.reference
                        blobId = ref ? blobStore.getBlobId(ref) : null
                    } else if (b.class.name == 'org.apache.jackrabbit.oak.plugins.value.BinaryImpl'){
                        //Oak Binary does not implement JackrabbitValue
                        blobId = b.value?.blob?.contentIdentity
                    }

                    stats.size += binarySize
                }finally{
                    // Binaries must be released explicitly once they are no longer needed
                    b.dispose()
                }

                // Binaries without a resolvable blob id are counted in the stats but not reported
                if (blobId){
                    binaryCallback([
                            blobId:blobId,
                            binarySize: binarySize,
                            mimeType: mimeType,
                            encoding: encoding,
                            path: node.path]
                    )
                }
                stats.count++
            }

            // Progress reporting for long running traversals
            if (count % 10000L == 0L){
                logp ("Traversed $count nodes so far")
            }

            if (count % 1000000L == 0L){
                dumpStats(typeStatsMap)
            }
        }
    }

    root.accept(v)
    return []
}
/**
 * Renders the collected per-mimetype statistics as a fixed-width text table, largest total size
 * first, and emits it through logp.
 *
 * Side effect: every stat entry of the given map gets a `sizeString` key holding the
 * human readable form of its `size`.
 *
 * @param typeStatsMap map whose values expose `name`, `count` and `size`
 */
def dumpStats(def typeStatsMap){
    // Copy the values so that sorting does not depend on the type of the values() collection
    def stats = new ArrayList(typeStatsMap.values())
    stats.each {stat -> stat.sizeString = humanReadableByteCount(stat.size)}
    stats.sort {-it.size}

    // The row template truncates every cell to the column size, so the raw byte column must be
    // wide enough for any long value (up to 19 digits); otherwise large totals would be cut off
    // and shown as a smaller, wrong number.
    def columns = [
            [name:"name",displayName:"Name",size:40],
            [name:"count",displayName:"Count",size:10],
            [name:"sizeString",displayName:"Size",size:10],
            [name:"size",displayName:"Size",size:20],
    ]

    def ttf = new TemplateFactory2()
    ttf.columns = columns

    def table = new SimpleTemplateEngine().createTemplate(ttf.template).make([rows:stats]).toString()
    logp(table)
}
/**
 * Emits the message twice: on the script's standard output and, at info level, through the
 * script-wide `logger`.
 *
 * @param msg the message; its toString() form is what gets logged
 */
def logp(def msg){
    println(msg)
    def rendered = msg.toString()
    logger.info(rendered)
}
/**
 * Wraps a value in double quotes for use as a CSV field, doubling any double quote it contains.
 *
 * @param s the raw value; may be null
 * @return the quoted value, or an empty string when s is null or empty
 */
def quote(String s){
    if (!s){
        return ""
    }
    def escaped = s.replace('"', '""')
    return "\"${escaped}\""
}
/**
 * Builds the source text of a fixed-width text table template from a list of column
 * descriptors. The generated text uses <% %> scriptlets and ${} expressions and expects a
 * `rows` binding; dumpStats feeds it to SimpleTemplateEngine.
 */
class TemplateFactory2 {
// Column descriptors. The template below reads `name` (key looked up on each row),
// `displayName` (header text) and `size` (column width) from every entry.
def columns = []
// Returns the template source, consisting of:
//  - a header line with each displayName centered to the column size,
//  - a separator line with "_" repeated size times per column,
//  - one line per element of `rows`, where each cell is the row's value for the column name,
//    converted with toString(), right-padded to the column size and then cut to exactly that
//    size (longer values are truncated).
// Column values are interpolated when this method runs; the escaped \${...} and the <% %>
// parts are left in the text for the template engine to evaluate later.
def getTemplate() { """
${columns.collect{ " <%print \"$it.displayName\".center($it.size)%> " }.join()}
${columns.collect{ " <%print \"_\"*$it.size %> " }.join()}
<% rows.each {%>${columns.collect{ " \${it.${it.name}.toString().padRight($it.size).substring(0,$it.size)} " }.join()}
<% } %>"""
}
}
/**
 * Looks up the Oak BlobStore service through the `osgi` helper.
 *
 * @return the service as returned by the lookup, or null when the lookup throws
 */
def fetchBlobStore() {
    def store = null
    try {
        store = osgi.getService(org.apache.jackrabbit.oak.spi.blob.BlobStore.class)
    } catch (Throwable ignore) {
        //Running in JR2 - best effort lookup, so carry on without a blob store
    }
    return store
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment