@jfrazee
Last active September 5, 2019 12:28
Spark job to read gzip files, ignoring corrupted files
import java.io._
import scala.io._
import java.util.zip._

// Spark
import org.slf4j.Logger
import org.apache.spark.{ SparkConf, SparkContext, Logging }

// Hadoop
import org.apache.hadoop.io.compress.GzipCodec

object FilterBadGzipFiles extends Logging {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
    val sc = new SparkContext(sparkConf)

    // binaryFiles yields (path, PortableDataStream) pairs, so each file's
    // bytes are only read inside the task, where failures can be caught.
    val files = sc.binaryFiles(args(0))

    val lines =
      files.flatMap {
        case (path, stream) =>
          try {
            // Wrap .gz files in a GZIPInputStream; read anything else raw.
            val is =
              if (path.toLowerCase.endsWith(".gz"))
                new GZIPInputStream(stream.open)
              else
                stream.open
            try {
              Source.fromInputStream(is).getLines.toList
            } finally {
              try { is.close } catch { case _: Throwable => }
            }
          } catch {
            // A corrupted or truncated file throws while being read; log it
            // and contribute no lines so the job keeps running.
            case e: Throwable =>
              log.warn(s"error reading from ${path}: ${e.getMessage}", e)
              List.empty[String]
          }
      }

    lines.saveAsTextFile(args(1), classOf[GzipCodec])
  }
}
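
This targets Spark 1.x; args(0) is the input path or glob and args(1) is the output directory. Note that org.apache.spark.Logging was removed from the public API in Spark 2.0, so on newer versions the job needs its own logger. A minimal sketch of that variant, assuming slf4j is on the classpath (the object name FilterBadGzipFiles2 is illustrative; the logic is unchanged from the gist):

import java.util.zip.GZIPInputStream
import scala.io.Source
import org.slf4j.LoggerFactory
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.hadoop.io.compress.GzipCodec

object FilterBadGzipFiles2 {
  // slf4j logger in place of the removed Logging trait
  @transient private lazy val log = LoggerFactory.getLogger(getClass)

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf())
    val lines = sc.binaryFiles(args(0)).flatMap { case (path, stream) =>
      try {
        val is =
          if (path.toLowerCase.endsWith(".gz")) new GZIPInputStream(stream.open)
          else stream.open
        try Source.fromInputStream(is).getLines.toList
        finally { try is.close() catch { case _: Throwable => } }
      } catch {
        case e: Throwable =>
          log.warn(s"error reading from $path: ${e.getMessage}", e)
          List.empty[String]
      }
    }
    lines.saveAsTextFile(args(1), classOf[GzipCodec])
  }
}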
@raj638111

Works like a charm, thanks.

@zhangnew

zhangnew commented Sep 6, 2017

Thanks.

@stdnt-xiao

Your method can fail when you deal with big data (Source.fromInputStream(is).getLines.toList materializes each whole file in executor memory).

You can also use this conf to do it:
.set("spark.files.ignoreCorruptFiles", "true")

@sylvinho81

Hi,

That option works, but is there any way to know which files are corrupted, rather than just ignoring them?

Thanks,
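
One way to get that list, sketched with the same binaryFiles approach as the gist (untested; ReportCorruptGzipFiles and the buffer size are illustrative): attempt to fully decompress each file inside a Try and collect the paths that fail, since gzip CRC errors only surface once the stream has been read to the end.

import java.util.zip.GZIPInputStream
import scala.util.Try
import org.apache.spark.{ SparkConf, SparkContext }

object ReportCorruptGzipFiles {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf())
    val corrupt = sc.binaryFiles(args(0))
      .map { case (path, stream) =>
        val ok = Try {
          val is = new GZIPInputStream(stream.open)
          try {
            val buf = new Array[Byte](64 * 1024)
            while (is.read(buf) != -1) () // drain fully so CRC errors surface
          } finally is.close()
        }.isSuccess
        (path, ok)
      }
      .filter { case (_, ok) => !ok }
      .map(_._1)
    corrupt.collect().foreach(path => println(s"corrupt: $path"))
  }
}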
