@JoshRosen
Created July 18, 2013 05:57
Methods missing from the Java API in Spark 0.7.3. Because this list was generated by an automated script, it may contain a few false positives.
Missing RDD methods
spark.api.java.JavaRDD<T> filter(spark.api.java.function.Function<T, java.lang.Object>)
spark.api.java.JavaPairRDD<T, U> zip(spark.api.java.JavaRDD<U>)
void foreachPartition(spark.api.java.function.VoidFunction<java.util.Iterator<T>>)
void foreachWith(spark.api.java.function.Function<java.lang.Object, A>, spark.api.java.function.Function2<T, A, scala.runtime.BoxedUnit>)
spark.api.java.JavaRDD<U> mapPartitions(spark.api.java.function.FlatMapFunction<java.util.Iterator<T>, U>, boolean)
java.lang.Object take(int)
spark.partial.PartialResult<spark.partial.BoundedDouble> countApprox(long, java.lang.Double)
java.lang.Object collect()
spark.api.java.JavaRDD<U> flatMapWith(spark.api.java.function.Function<java.lang.Object, A>, boolean, spark.api.java.function.Function2<T, A, java.util.List<U>>)
spark.api.java.JavaRDD<U> mapPartitionsWithSplit(spark.api.java.function.Function2<java.lang.Object, java.util.Iterator<T>, java.util.Iterator<U>>, boolean)
spark.api.java.JavaRDD<U> collect(scala.PartialFunction<T, U>)
spark.api.java.JavaRDD<java.lang.Object> glom()
spark.api.java.JavaRDD<U> mapWith(spark.api.java.function.Function<java.lang.Object, A>, boolean, spark.api.java.function.Function2<T, A, U>)
java.lang.Object takeSample(boolean, int, int)
spark.api.java.JavaPairRDD<T, U> cartesian(spark.api.java.JavaRDD<U>)
java.lang.String getCheckpointFile()
spark.api.java.JavaPairRDD<K, java.util.List<T>> groupBy(spark.api.java.function.Function<T, K>, spark.Partitioner)
spark.api.java.JavaRDD<T> filterWith(spark.api.java.function.Function<java.lang.Object, A>, spark.api.java.function.Function2<T, A, java.lang.Object>)
java.lang.String name()
spark.api.java.JavaRDD<T> setName(java.lang.String)
spark.api.java.JavaRDD<T> sample(boolean, java.lang.Double, int)
java.lang.Object toArray()
java.util.Iterator<T> compute(spark.Partition, spark.TaskContext)
spark.api.java.JavaRDD<U> mapPartitionsWithIndex(spark.api.java.function.Function2<java.lang.Object, java.util.Iterator<T>, java.util.Iterator<U>>, boolean)
java.util.Map<T, java.lang.Object> countByValue()
spark.api.java.JavaRDD<U> map(spark.api.java.function.Function<T, U>)
spark.partial.PartialResult<java.util.Map<T, spark.partial.BoundedDouble>> countByValueApprox(long, java.lang.Double)
java.util.List<java.lang.String> preferredLocations(spark.Partition)
spark.api.java.JavaRDD<T> persist()
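
For reference, a minimal Scala sketch of how a few of these operations (zip, foreachPartition, countByValue) are called from the Scala API of this era; the local master string and sample data are placeholder assumptions:

```scala
import spark.SparkContext
import spark.SparkContext._

object RddScalaOnlyExamples {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "rdd-examples")
    val nums  = sc.parallelize(1 to 4)
    val words = sc.parallelize(Seq("a", "b", "c", "d"))

    // zip: pair up corresponding elements of two RDDs with the same partitioning
    val zipped = nums.zip(words)                      // RDD[(Int, String)]

    // foreachPartition: run a side-effecting function once per partition
    nums.foreachPartition(iter => println("partition size: " + iter.size))

    // countByValue: count occurrences of each distinct element, returned to the driver
    val counts = words.countByValue()

    println(zipped.collect().mkString(", "))
    println(counts)
    sc.stop()
  }
}
```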
Missing PairRDD methods
void saveAsNewAPIHadoopFile(java.lang.String, java.lang.Class<?>, java.lang.Class<?>, java.lang.Class<? extends org.apache.hadoop.mapreduce.OutputFormat<?, ?>>, org.apache.hadoop.conf.Configuration)
void saveAsHadoopFile(java.lang.String, java.lang.Class<?>, java.lang.Class<?>, java.lang.Class<? extends org.apache.hadoop.mapred.OutputFormat<?, ?>>, org.apache.hadoop.mapred.JobConf)
spark.api.java.JavaPairRDD<K, scala.Tuple2<V, W>> leftOuterJoin(spark.api.java.JavaPairRDD<K, W>, spark.Partitioner)
spark.api.java.JavaPairRDD<K, C> combineByKey(spark.api.java.function.Function<V, C>, spark.api.java.function.Function2<C, V, C>, spark.api.java.function.Function2<C, C, C>, spark.Partitioner, boolean)
spark.api.java.JavaPairRDD<K, scala.Tuple2<V, W>> leftOuterJoin(spark.api.java.JavaPairRDD<K, W>, int)
spark.api.java.JavaPairRDD<K, V> partitionBy(spark.Partitioner, boolean)
void saveAsHadoopFile(java.lang.String)
spark.api.java.JavaPairRDD<K, scala.Tuple2<V, W>> leftOuterJoin(spark.api.java.JavaPairRDD<K, W>)
spark.api.java.JavaPairRDD<K, scala.Tuple2<V, W>> rightOuterJoin(spark.api.java.JavaPairRDD<K, W>)
spark.partial.PartialResult<java.util.Map<K, spark.partial.BoundedDouble>> countByKeyApprox(long, java.lang.Double)
spark.api.java.JavaPairRDD<K, V> subtractByKey(spark.api.java.JavaPairRDD<K, W>)
void saveAsNewAPIHadoopFile(java.lang.String)
spark.api.java.JavaPairRDD<K, V> subtractByKey(spark.api.java.JavaPairRDD<K, W>, int)
spark.api.java.JavaPairRDD<K, scala.Tuple2<V, W>> rightOuterJoin(spark.api.java.JavaPairRDD<K, W>, spark.Partitioner)
spark.api.java.JavaPairRDD<K, U> flatMapValues(spark.api.java.function.FlatMapFunction<V, U>)
spark.api.java.JavaPairRDD<K, V> subtractByKey(spark.api.java.JavaPairRDD<K, W>, spark.Partitioner)
java.util.Map<K, V> reduceByKeyToDriver(spark.api.java.function.Function2<V, V, V>)
spark.api.java.JavaPairRDD<K, scala.Tuple2<V, W>> rightOuterJoin(spark.api.java.JavaPairRDD<K, W>, int)
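
A similar sketch for some of the pair operations (leftOuterJoin, subtractByKey, flatMapValues) on the Scala side; the implicit conversion to PairRDDFunctions comes from `spark.SparkContext._`, and the sample data is assumed:

```scala
import spark.SparkContext
import spark.SparkContext._   // implicit conversion to PairRDDFunctions

object PairRddScalaOnlyExamples {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "pair-rdd-examples")
    val left  = sc.parallelize(Seq(("a", 1), ("b", 2)))
    val right = sc.parallelize(Seq(("a", 10)))

    // leftOuterJoin: keep every key from the left side, pairing it with an Option of the right value
    val joined = left.leftOuterJoin(right)       // RDD[(String, (Int, Option[Int]))]

    // subtractByKey: drop pairs whose key also appears in the other RDD
    val onlyLeft = left.subtractByKey(right)

    // flatMapValues: expand each value into zero or more values while keeping its key
    val expanded = left.flatMapValues(v => 1 to v)

    println(joined.collect().mkString(", "))
    println(onlyLeft.collect().mkString(", "))
    println(expanded.collect().mkString(", "))
    sc.stop()
  }
}
```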
Missing DoubleRDD methods
java.lang.Double sampleStdev()
Missing OrderedRDD methods
spark.api.java.JavaPairRDD<K, V> sortByKey(boolean, int)
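
Both of these have Scala counterparts reachable through the implicits in `spark.SparkContext._`; a minimal sketch with assumed sample data:

```scala
import spark.SparkContext
import spark.SparkContext._   // DoubleRDDFunctions and OrderedRDDFunctions via implicits

object DoubleAndOrderedScalaOnlyExamples {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "double-ordered-examples")

    // sampleStdev: standard deviation of the values, treating them as a sample (divides by n - 1)
    val doubles = sc.parallelize(Seq(1.0, 2.0, 3.0, 4.0))
    println("sample stdev: " + doubles.sampleStdev())

    // sortByKey(ascending, numPartitions): sort by key with an explicit partition count
    val pairs = sc.parallelize(Seq(("b", 2), ("a", 1), ("c", 3)))
    println(pairs.sortByKey(true, 2).collect().mkString(", "))

    sc.stop()
  }
}
```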
Missing SparkContext methods
spark.api.java.JavaPairRDD<K, V> newAPIHadoopFile(java.lang.String)
java.util.List<java.lang.String> jars()
spark.api.java.JavaPairRDD<K, V> hadoopRDD(org.apache.hadoop.mapred.JobConf, java.lang.Class<? extends org.apache.hadoop.mapred.InputFormat<K, V>>, java.lang.Class<K>, java.lang.Class<V>, int)
void addSparkListener(spark.scheduler.SparkListener)
spark.api.java.JavaPairRDD<K, V> hadoopFile(java.lang.String)
int defaultParallelism()
java.util.Map<java.lang.String, java.lang.String> environment()
int defaultMinSplits()
spark.api.java.JavaPairRDD<K, V> sequenceFile(java.lang.String, int, scala.Function0<spark.WritableConverter<K>>, scala.Function0<spark.WritableConverter<V>>)
java.util.List<java.lang.String> jarOfObject(java.lang.Object)
java.lang.String appName()
spark.Accumulable<R, T> accumulableCollection(R, spark.api.java.function.Function<R, scala.collection.generic.Growable<T>>)
java.lang.String master()
java.lang.String sparkHome()
spark.api.java.JavaPairRDD<K, V> hadoopFile(java.lang.String, int)
spark.api.java.JavaPairRDD<K, V> hadoopFile(java.lang.String, java.lang.Class<? extends org.apache.hadoop.mapred.InputFormat<K, V>>, java.lang.Class<K>, java.lang.Class<V>, int)
java.util.List<java.lang.String> jarOfClass(java.lang.Class<?>)
spark.api.java.JavaRDD<T> union(java.util.List<spark.api.java.JavaRDD<T>>)
java.util.Map<java.lang.String, scala.Tuple2<java.lang.Object, java.lang.Object>> getExecutorMemoryStatus()
java.util.Map<spark.scheduler.Stage, spark.scheduler.StageInfo> getStageInfo()
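
A sketch of a few of the missing SparkContext entry points as called from the Scala API; the HDFS path is a placeholder, and the Hadoop classes are the stock `mapred` text-input ones:

```scala
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.TextInputFormat
import spark.SparkContext
import spark.SparkContext._

object SparkContextScalaOnlyExamples {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "context-examples")

    // defaultParallelism / defaultMinSplits: scheduler defaults only exposed on the Scala context
    println("default parallelism: " + sc.defaultParallelism)
    println("default min splits:  " + sc.defaultMinSplits)

    // hadoopFile with explicit key, value, and InputFormat types (path is a placeholder;
    // nothing is read until an action is run on the resulting RDD)
    val lines = sc.hadoopFile[LongWritable, Text, TextInputFormat]("hdfs:///path/to/input")

    // union over an arbitrary sequence of RDDs
    val parts = (1 to 3).map(i => sc.parallelize(Seq(i)))
    println(sc.union(parts).collect().mkString(", "))

    sc.stop()
  }
}
```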
Missing StreamingContext methods
spark.streaming.PairDStreamFunctions<K, V> toPairDStreamFunctions(spark.streaming.api.java.JavaPairDStream<K, V>)
spark.streaming.api.java.JavaDStream<twitter4j.Status> twitterStream(twitter4j.auth.Authorization, java.util.List<java.lang.String>, spark.storage.StorageLevel)
spark.streaming.api.java.JavaPairDStream<K, V> fileStream(java.lang.String, spark.api.java.function.Function<org.apache.hadoop.fs.Path, java.lang.Object>, boolean)
spark.streaming.api.java.JavaDStream<T> union(java.util.List<spark.streaming.api.java.JavaDStream<T>>)
spark.streaming.api.java.JavaDStream<T> zeroMQStream(java.lang.String, akka.zeromq.Subscribe, spark.api.java.function.FlatMapFunction<java.util.List<java.util.List<java.lang.Object>>, T>, spark.storage.StorageLevel, akka.actor.SupervisorStrategy)
spark.streaming.api.java.JavaDStream<T> networkStream(spark.streaming.dstream.NetworkReceiver<T>)
void registerOutputStream(spark.streaming.api.java.JavaDStream<?>)
spark.SparkContext sc()
spark.streaming.api.java.JavaDStream<T> socketStream(java.lang.String, int, spark.api.java.function.FlatMapFunction<java.io.InputStream, T>, spark.storage.StorageLevel)
spark.SparkContext sparkContext()
spark.streaming.input.KafkaFunctions toKafkaFunctions(spark.streaming.StreamingContext)
void registerInputStream(spark.streaming.dstream.InputDStream<?>)
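
A sketch of the custom-converter `socketStream` and context-level `union` from the Scala StreamingContext; host, port, and storage level are placeholder assumptions:

```scala
import java.io.{BufferedReader, InputStream, InputStreamReader}
import spark.storage.StorageLevel
import spark.streaming.{Seconds, StreamingContext}

object StreamingContextScalaOnlyExamples {
  def main(args: Array[String]) {
    // local[2]: one thread for the network receiver, one for processing
    val ssc = new StreamingContext("local[2]", "streaming-examples", Seconds(1))

    // socketStream with a custom converter from the raw InputStream to an Iterator of records
    def toLines(in: InputStream): Iterator[String] = {
      val reader = new BufferedReader(new InputStreamReader(in))
      Iterator.continually(reader.readLine()).takeWhile(_ != null)
    }
    val custom = ssc.socketStream("localhost", 9999, toLines, StorageLevel.MEMORY_ONLY)

    // union over several streams of the same element type
    val merged = ssc.union(Seq(custom, custom.filter(_.length > 0)))
    merged.print()

    ssc.start()
  }
}
```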
Missing DStream methods
void saveAsTextFiles(java.lang.String, java.lang.String)
spark.streaming.api.java.JavaDStream<U> map(spark.api.java.function.Function<T, U>)
spark.streaming.api.java.JavaDStream<java.lang.Object> count()
java.util.List<spark.api.java.JavaRDD<T>> slice(spark.streaming.Interval)
void foreach(spark.api.java.function.Function2<spark.api.java.JavaRDD<T>, spark.streaming.Time, scala.runtime.BoxedUnit>)
void saveAsObjectFiles(java.lang.String, java.lang.String)
spark.streaming.Duration slideDuration()
spark.streaming.api.java.JavaPairDStream<T, java.lang.Object> countByValueAndWindow(spark.streaming.Duration, spark.streaming.Duration, int)
spark.streaming.api.java.JavaDStream<T> filter(spark.api.java.function.Function<T, java.lang.Object>)
spark.streaming.api.java.JavaDStream<T> checkpoint(spark.streaming.Duration)
spark.streaming.api.java.JavaDStream<java.lang.Object> countByWindow(spark.streaming.Duration, spark.streaming.Duration)
spark.streaming.api.java.JavaDStream<java.lang.Object> glom()
spark.streaming.api.java.JavaDStream<U> mapPartitions(spark.api.java.function.FlatMapFunction<java.util.Iterator<T>, U>, boolean)
scala.collection.immutable.List<spark.streaming.api.java.JavaDStream<?>> dependencies()
spark.streaming.api.java.JavaPairDStream<T, java.lang.Object> countByValue(int)
spark.streaming.api.java.JavaDStream<T> reduceByWindow(spark.api.java.function.Function2<T, T, T>, spark.streaming.Duration, spark.streaming.Duration)
spark.streaming.StreamingContext ssc()
void register()
void foreach(spark.api.java.function.VoidFunction<spark.api.java.JavaRDD<T>>)
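
A sketch of some windowed and per-RDD operations on a Scala DStream; the checkpoint directory, socket source, and output prefix are placeholders:

```scala
import spark.streaming.{Seconds, StreamingContext}

object DStreamScalaOnlyExamples {
  def main(args: Array[String]) {
    val ssc = new StreamingContext("local[2]", "dstream-examples", Seconds(1))
    ssc.checkpoint("/tmp/dstream-checkpoint")   // required by the incremental window counting below

    val lines = ssc.socketTextStream("localhost", 9999)

    // countByWindow(windowDuration, slideDuration): count all elements in a sliding window
    val windowCounts = lines.countByWindow(Seconds(10), Seconds(2))

    // reduceByWindow: fold the elements of each window with an associative reduce function
    val longest = lines.reduceByWindow((a, b) => if (a.length > b.length) a else b,
                                       Seconds(10), Seconds(2))

    // foreach over each generated RDD (the per-batch hook, renamed foreachRDD in later releases)
    lines.foreach(rdd => println("batch size: " + rdd.count()))

    // saveAsTextFiles(prefix, suffix): write each batch out as prefix-<time>.suffix
    lines.saveAsTextFiles("/tmp/dstream-output", "txt")

    windowCounts.print()
    longest.print()
    ssc.start()
  }
}
```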
Missing PairDStream methods
spark.streaming.api.java.JavaPairDStream<K, V> reduceByKeyAndWindow(spark.api.java.function.Function2<V, V, V>, spark.streaming.Duration, spark.streaming.Duration)
spark.streaming.api.java.JavaPairDStream<K, V> reduceByKeyAndWindow(spark.api.java.function.Function2<V, V, V>, spark.api.java.function.Function2<V, V, V>, spark.streaming.Duration, spark.streaming.Duration, int, spark.api.java.function.Function<scala.Tuple2<K, V>, java.lang.Object>)
spark.streaming.api.java.JavaPairDStream<K, S> updateStateByKey(spark.api.java.function.Function2<java.util.List<V>, S, S>)
spark.streaming.api.java.JavaPairDStream<K, S> updateStateByKey(spark.api.java.function.Function2<java.util.List<V>, S, S>, int)
spark.HashPartitioner defaultPartitioner(int)
spark.streaming.api.java.JavaPairDStream<K, S> updateStateByKey(spark.api.java.function.FlatMapFunction<java.util.Iterator<scala.Tuple3<K, java.util.List<V>, S>>, scala.Tuple2<K, S>>, spark.Partitioner, boolean)
spark.streaming.api.java.JavaPairDStream<K, V> reduceByKeyAndWindow(spark.api.java.function.Function2<V, V, V>, spark.streaming.Duration)
spark.streaming.api.java.JavaPairDStream<K, V> reduceByKeyAndWindow(spark.api.java.function.Function2<V, V, V>, spark.streaming.Duration, spark.streaming.Duration, spark.Partitioner)
spark.streaming.api.java.JavaPairDStream<K, S> updateStateByKey(spark.api.java.function.Function2<java.util.List<V>, S, S>, spark.Partitioner)
spark.streaming.api.java.JavaPairDStream<K, V> reduceByKeyAndWindow(spark.api.java.function.Function2<V, V, V>, spark.api.java.function.Function2<V, V, V>, spark.streaming.Duration, spark.streaming.Duration, spark.Partitioner, spark.api.java.function.Function<scala.Tuple2<K, V>, java.lang.Object>)
spark.streaming.StreamingContext ssc()
spark.streaming.api.java.JavaPairDStream<K, V> reduceByKeyAndWindow(spark.api.java.function.Function2<V, V, V>, spark.streaming.Duration, spark.streaming.Duration, int)
spark.streaming.api.java.JavaPairDStream<K, U> flatMapValues(spark.api.java.function.FlatMapFunction<V, U>)
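
A sketch of the stateful and windowed pair operations from the Scala API; note that the Scala `updateStateByKey` takes a `(Seq[V], Option[S]) => Option[S]` function rather than the two-argument form listed above, and the socket source and checkpoint directory are placeholders:

```scala
import spark.streaming.{Seconds, StreamingContext}
import spark.streaming.StreamingContext._   // implicit conversion to PairDStreamFunctions

object PairDStreamScalaOnlyExamples {
  def main(args: Array[String]) {
    val ssc = new StreamingContext("local[2]", "pair-dstream-examples", Seconds(1))
    ssc.checkpoint("/tmp/pair-dstream-checkpoint")   // stateful operations require a checkpoint dir

    val words = ssc.socketTextStream("localhost", 9999).flatMap(_.split(" "))
    val pairs = words.map(w => (w, 1))

    // reduceByKeyAndWindow with a reduce and an inverse-reduce function, so each window
    // is updated incrementally instead of being recomputed from scratch
    val windowed = pairs.reduceByKeyAndWindow(_ + _, _ - _, Seconds(10), Seconds(2))

    // updateStateByKey: maintain a running total per key across batches
    val totals = pairs.updateStateByKey[Int] { (values: Seq[Int], state: Option[Int]) =>
      Some(state.getOrElse(0) + values.sum)
    }

    windowed.print()
    totals.print()
    ssc.start()
  }
}
```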