Skip to content

Instantly share code, notes, and snippets.

@pauljm
Last active June 28, 2019 21:39
Show Gist options
  • Save pauljm/870f61f74a1c4491cf1e to your computer and use it in GitHub Desktop.
Save pauljm/870f61f74a1c4491cf1e to your computer and use it in GitHub Desktop.
Quick and dirty utility to list all the objects in an S3 bucket with a certain prefix and, for any whose key matches a pattern, read the file line by line and print any lines that match a second pattern. Adjust constants as appropriate. Usage: sbt 'run <AWS access key ID> <AWS secret key>'
// sbt build definition for the s3inspect utility.
// Organization identifier for the project.
organization := "com.conspire"
// Project name.
name := "s3inspect"
// AWS SDK for Java: the Scala source imports its credentials, S3 client and S3 model classes.
libraryDependencies ++= Seq("com.amazonaws" % "aws-java-sdk" % "1.9.28.1")
/**
 * Directory structure should be:
 * - project_root
 *   - build.sbt (the sbt settings accompanying this file)
 *   - src
 *     - main
 *       - scala
 *         - S3Inspect.scala
 */
package s3inspect
import scala.collection.JavaConversions._
import java.io.{ InputStream, InputStreamReader, BufferedReader, ByteArrayInputStream }
import java.util.regex.Pattern
import com.amazonaws.auth.BasicAWSCredentials
import com.amazonaws.services.s3.AmazonS3Client
import com.amazonaws.services.s3.model._
/**
* Quick and dirty utility to list all the objects in a bucket with a certain prefix
* and, for any whose key matches a pattern, read the file line by line and print
* any lines that match a second pattern. Adjust constants as appropriate.
*
* Usage: sbt 'run <AWS access key ID> <AWS secret key>'
*/
object S3Inspect {

  /** Bucket whose objects are listed. */
  val BucketName: String = "my-bucket-name"

  /** Only keys starting with this prefix are listed. */
  val ObjectPrefix: String = "prefix/to/haystack/"

  /**
   * Pattern a key must match in full for the object to be inspected.
   * The prefix is quoted so that regex metacharacters in it are taken literally.
   */
  val ObjectPattern: Pattern = Pattern.compile(Pattern.quote(ObjectPrefix) + "[0-9]+\\.txt")

  /** Pattern searched for (partial match) in every line of an inspected object. */
  val LinePattern: Pattern = Pattern.compile("a|(some)needles?")

  /**
   * Entry point. Expects exactly two arguments: the AWS access key ID and the
   * AWS secret key. Prints a usage message and exits with status 1 otherwise.
   *
   * NOTE(review): credentials passed on the command line may end up in shell
   * history or process listings; acceptable for a quick utility only.
   *
   * @param args command-line arguments: access key ID, secret key
   */
  def main(args: Array[String]): Unit = {
    // AWS credentials are supplied at the command line
    if (args.length != 2) {
      println("Usage: sbt 'run <AWS access key ID> <AWS secret key>'")
      System.exit(1)
    }
    val s3 = new AmazonS3Client(new BasicAWSCredentials(args(0), args(1)))
    inspectBucket(s3)
  }

  /**
   * List objects with the specified prefix and, for those that match the (whole) object pattern,
   * check lines against the line pattern. Prints a progress line after every 100 matching objects.
   *
   * @param s3 client used to list and fetch the objects
   */
  def inspectBucket(s3: AmazonS3Client): Unit = {
    // Pool of 10 workers to process objects in parallel
    val pool = new scala.concurrent.forkjoin.ForkJoinPool(10)
    val tasksupport = new scala.collection.parallel.ForkJoinTaskSupport(pool)
    val listObjectsRequest = new ListObjectsRequest().withBucketName(BucketName).withPrefix(ObjectPrefix)
    // Incremented from several worker threads at once, so it must be atomic;
    // a plain var would lose updates.
    val matchCount = new java.util.concurrent.atomic.AtomicInteger(0)
    try {
      // Keep listing until all objects listed
      var listing: ObjectListing = null
      do {
        listing = s3.listObjects(listObjectsRequest)
        // Use parallel collections to process several objects in parallel
        val parallelSummaries = listing.getObjectSummaries.par
        parallelSummaries.tasksupport = tasksupport
        parallelSummaries.foreach { objectSummary =>
          val key = objectSummary.getKey
          // Inspect matching objects and track total objects inspected
          if (ObjectPattern.matcher(key).matches) {
            inspectObject(s3, key)
            val processed = matchCount.incrementAndGet()
            if (processed % 100 == 0)
              println(s"...$processed matching objects processed...")
          }
        }
        // Continue the next page where this one ended
        listObjectsRequest.setMarker(listing.getNextMarker)
      } while (listing.isTruncated)
    } finally {
      // Release the worker threads even if listing or inspection failed
      pool.shutdown()
    }
  }

  /**
   * Read object and print lines that include (don't have to match completely) the line pattern.
   * Once an empty line has been seen in the object, every later matching line is
   * followed by an "AFTER EMPTY LINE" marker.
   *
   * @param s3  client used to fetch the object
   * @param key key of the object to read from [[BucketName]]
   */
  def inspectObject(s3: AmazonS3Client, key: String): Unit = {
    val content = s3.getObject(BucketName, key).getObjectContent
    try {
      val reader = new BufferedReader(new InputStreamReader(content, "UTF-8"))
      var afterEmptyLine = false
      // readLine() returns null at end of stream
      Iterator.continually(reader.readLine()).takeWhile(_ != null).foreach { line =>
        if (LinePattern.matcher(line).find) {
          // Emit the match and its marker in one println so that output from
          // other workers cannot be interleaved between the two lines.
          val marker =
            if (afterEmptyLine) scala.util.Properties.lineSeparator + " ^^^ AFTER EMPTY LINE ^^^"
            else ""
          println(s"$key: $line$marker")
        } else if (line.isEmpty) {
          afterEmptyLine = true
        }
      }
    } finally {
      // Always close the content stream so the underlying connection is released
      content.close()
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment