package felixconsole
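/*
 * Felix/Sling script-console script (original gist by @chetanmeh) to rebuild missing
 * Lucene index blobs from the local copy-on-write index directories.
 *
 * Rough flow, as read from the code below (the `osgi` binding and the crx-quickstart
 * paths assume an AEM/Sling script-console environment):
 *  1. Read index-data.json, which lists per-index Lucene files and the blobIds/uniqueKeys
 *     they had in the repository (a sample structure is in the comment block further below).
 *  2. For every local index dir under crx-quickstart/repository/index, copy each Lucene
 *     file into an OakDirectory seeded with the original uniqueKey, then write the
 *     resulting chunks into a FileDataStore under crx-quickstart/logs/dump/datastore.
 *  3. Report how many of the blobIds flagged invalid in index-data.json are now present
 *     in that datastore, i.e. recoverable.
 */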
import com.google.common.base.Charsets
import com.google.common.hash.Hashing
import groovy.json.JsonOutput
import groovy.json.JsonSlurper
import org.apache.commons.io.FileUtils
import org.apache.jackrabbit.core.data.DataIdentifier
import org.apache.jackrabbit.core.data.DataRecord
import org.apache.jackrabbit.core.data.FileDataStore
import org.apache.jackrabbit.oak.api.Blob
import org.apache.jackrabbit.oak.api.Type
import org.apache.jackrabbit.oak.commons.PathUtils
import org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexConstants
import org.apache.jackrabbit.oak.spi.blob.BlobStore
import org.apache.jackrabbit.oak.spi.state.ChildNodeEntry
import org.apache.jackrabbit.oak.spi.state.NodeBuilder
import org.apache.jackrabbit.oak.spi.state.NodeState
import org.apache.jackrabbit.oak.spi.state.NodeStateUtils
import org.apache.jackrabbit.oak.spi.state.NodeStore
import org.apache.lucene.store.Directory
import org.apache.lucene.store.FSDirectory
import org.apache.lucene.store.IOContext
import org.apache.lucene.store.NoLockFactory
import org.slf4j.LoggerFactory
import static org.apache.jackrabbit.oak.plugins.memory.EmptyNodeState.EMPTY_NODE
import static org.apache.jackrabbit.oak.plugins.nodetype.write.InitialContent.INITIAL_CONTENT
//Directory path under which dump data would be written
def dumpDirPath = 'crx-quickstart/logs/dump'
def cowDir = 'crx-quickstart/repository/index'
def indexDataPath = 'crx-quickstart/logs/index-data.json'
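//index-data.json is expected to already exist (see the assert below); it appears to map
//index name -> Lucene file name -> [{blobId, valid, key}] (see the sample comment block below)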
//----------------------<Create ref to private classes/Non Exported>
OakDirectory = loadLuceneClass('org.apache.jackrabbit.oak.plugins.index.lucene.OakDirectory')
IndexDefinition = loadLuceneClass('org.apache.jackrabbit.oak.plugins.index.lucene.IndexDefinition')
//~-------------------------------< Global Fields >
log = LoggerFactory.getLogger("script-console");
ns = getStore()
BlobStore blobStore = getBlobStore()
//~---------------------< Index File Copying >
File indexDataFile = new File(indexDataPath)
assert indexDataFile.exists() : "No file found at ${indexDataFile.absolutePath}"
def indexDataFromRepo = new JsonSlurper().parse(new StringReader(indexDataFile.text))
def dumpDir = new File(dumpDirPath)
FileUtils.forceMkdir(dumpDir)
logMsg("Files would be copied to ${dumpDir.absolutePath}")
File idxData = new File(dumpDir, "local-index-data.json")
def dsPath = "$dumpDirPath/datastore"
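//Target FileDataStore under the dump dir; recovered index file chunks are written here as data records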
FileDataStore ds = new FileDataStore()
ds.path = dsPath
ds.init(null)
File sourceDir = new File(cowDir)
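//getIndexPaths() maps sha256(indexPath) -> indexPath so hashed on-disk dir names can be
//resolved back to repository index paths; unknown dir names fall back to the raw name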
def hashToPathMapping = getIndexPaths()
def output = [:]
sourceDir.listFiles().each { File indexRootDir ->
    if (!indexRootDir.isDirectory()) {
        return
    }
    String indexDirName = indexRootDir.name
    String idxPath = hashToPathMapping[indexDirName] ?: indexDirName
    logMsg("Processing $idxPath")
    String indexName = PathUtils.getName(idxPath)
    def idxDataRepo = indexDataFromRepo[indexName]
    if (!idxDataRepo) {
        logMsg("No index data found for $idxPath in json file")
        return
    }
    if (idxDataRepo.valid) {
        logMsg("Skipping processing of $idxPath as it is valid")
        //TODO Re-enable
        return
    }
    def fileData = [:]
    indexRootDir.listFiles().each { File indexDir ->
        Directory sourceIdxDir = FSDirectory.open(indexDir, NoLockFactory.noLockFactory)
        sourceIdxDir.listAll().each { String fileName ->
            //Determine the uniqueKey for the index file
            def fileDataRepo = idxDataRepo.files.get(fileName)
            if (!fileDataRepo) {
                logMsg("Skipping processing of $idxPath/$fileName as no data found in index data json")
                return
            }
            String key = fileDataRepo[0].key
            def idxDefn = IndexDefinition.newInstance(INITIAL_CONTENT, EMPTY_NODE, null)
            NodeBuilder nb = EMPTY_NODE.builder()
            //Seed the original uniqueKey and blob size so the re-created chunks should
            //match the original blob content (and hence the original blob ids)
            nb.child(':data').child(fileName).setProperty('uniqueKey', key)
            nb.child(':data').child(fileName).setProperty('blobSize', idxDefn.getBlobSize())
            Directory dir = OakDirectory.newInstance(nb, idxDefn, false)
            sourceIdxDir.copy(dir, fileName, fileName, IOContext.READ)
            NodeBuilder fileNode = nb.getChildNode(':data').getChildNode(fileName)
            def binaries = fileNode.getProperty('jcr:data').getValue(Type.BINARIES)
            logMsg("\tCopying $fileName")
            def blobIds = [:]
            long srcSizes = 0, destSizes = 0
            binaries.each { Blob b ->
                InputStream is = b.newStream
                DataRecord dr = ds.addRecord(is)
                is.close()
                def blobId = dr.identifier.toString()
                logMsg("\t\tCreated blob $blobId, Size ${dr.length}")
                blobIds[blobId] = [size: dr.length, originalSize: b.length()]
                srcSizes += b.length() as long
                destSizes += dr.length as long
            }
            //assert srcSizes == destSizes
            //assert srcSizes == sourceIdxDir.fileLength(fileName)
            fileData[fileName] = blobIds
        }
    }
    output[idxPath] = [files: fileData]
}
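//Persist a summary of what was copied (index path -> file -> created blob ids and sizes)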
idxData.withWriter { w ->
    w.print(JsonOutput.prettyPrint(JsonOutput.toJson(output)))
}
/*
Sample structure of index-data.json (per index -> files -> blob entries):
{
  "cqPageLucene": {
    "files": {
      "_0.si": [
        {
          "blobId": "f76dedbc-b66b-4a3f-af95-45b787d3c917.d2bb",
          "valid": true,
          "key": "ab900a4ca931cb8cc6dd2bd5b79b6df0"
        }
      ],
      "_0.cfe": [
        {
          "blobId": "f76dedbc-b66b-4a3f-af95-45b787d3c917.d306",
          "valid": true,
          "key": "d333c042915ae96494b76836ed3f3aba"
        }
      ]
    }
  }
}
*/
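//Second pass: for every blob flagged invalid in index-data.json, check whether a record
//with the same identifier now exists in the freshly written dump datastore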
int invalidBlobCount = 0
int recoveredBlobCount = 0
indexDataFromRepo.each { idxName, data ->
    data.files.each { fileName, fileData ->
        fileData.each { blobData ->
            String blobId = blobData.blobId
            if (blobId.contains('#') && !blobData.valid) {
                invalidBlobCount++
                String fileId = blobId.substring(0, blobId.indexOf('#'))
                def stored = ds.getRecordIfStored(new DataIdentifier(fileId))
                logMsg("BlobId : $fileId, Recovered ${stored}")
                if (stored) {
                    recoveredBlobCount++
                }
            }
        }
    }
}
logMsg("invalidBlobCount: $invalidBlobCount, recovered: $recoveredBlobCount")
logMsg("Done copying")
Object getIndexPaths() {
    def map = [:]
    collectIndexData("", map)
    collectIndexData("/content", map)
    return map
}

def collectIndexData(String basePath, def map) {
    NodeStore ns = getStore()
    NodeState oakIndex = NodeStateUtils.getNode(ns.root, "$basePath/oak:index")
    oakIndex.childNodeEntries.each { ChildNodeEntry cne ->
        if (cne.getNodeState().getString('type') == 'lucene') {
            String idxPath = "$basePath/oak:index/${cne.name}"
            String hash = Hashing.sha256().hashString(idxPath, Charsets.UTF_8).toString()
            map[hash] = idxPath
        }
    }
}

def logMsg(String msg) {
    println msg
    log.info(msg)
}

Class loadLuceneClass(String className) {
    return LuceneIndexConstants.class.classLoader.loadClass(className)
}

def getStore() {
    osgi.getService(org.apache.sling.jcr.api.SlingRepository.class).manager.store
}

def getBlobStore() {
    osgi.getService(org.apache.jackrabbit.oak.spi.blob.BlobStore.class)
}