Related Setup: https://gist.github.com/hofmannsven/6814278
Related Pro Tips: https://ochronus.com/git-tips-from-the-trenches/
Related Setup: https://gist.github.com/hofmannsven/6814278
Related Pro Tips: https://ochronus.com/git-tips-from-the-trenches/
/**
 * Get the Stack Exchange data from https://archive.org/details/stackexchange
 * Data set used here: math.stackexchange.com
 **/
// Open the file. The text file is an RDD (Resilient Distributed Dataset)
// of Strings, where each element is one line of the file.
val postXML = sc.textFile("Posts.xml") | |
//Count the lines. Note: Run twice and see the difference ;) |
- Format: 7zipped
- Files:
  - **badges**.xml
    - UserId, e.g.: "420"
    - Name, e.g.: "Teacher"
    - Date, e.g.: "2008-09-15T08:55:03.923"
  - **comments**.xml
    - Id
    - PostId
    - Score
package org.aja.tej.tej.test.spark | |
/** | |
* Created by mageswaran on 9/8/15. | |
*/ | |
import java.util.Random | |
import org.apache.spark.{SparkConf, SparkContext} |
package org.aja.tej.examples | |
import java.io.File | |
import org.aja.tej.utils.TejUtils | |
import org.apache.spark.{SparkConf, SparkContext} | |
/** |
package org.aja.tej.examples.ml | |
import org.aja.tej.utils.TejUtils | |
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier | |
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator | |
import org.apache.spark.mllib.util.MLUtils | |
import org.apache.spark.sql.SQLContext | |
/** | |
* Created by mageswaran on 25/9/15. |
package org.aja.tej.tej.test.spark | |
/** | |
* Created by mageswaran on 9/8/15. | |
*/ | |
import java.util.Random | |
import org.apache.spark.{SparkConf, SparkContext} |
// For any updates, check: https://github.com/Mageswaran1989/aja/blob/master/src/examples/scala/org/aja/tej/examples/streaming/twitter/TwitterWithNeo4j.scala
package org.aja.tej.examples.streaming.twitter | |
import com.google.gson.Gson | |
import org.aja.tej.utils.{TejUtils, TejTwitterUtils} | |
import org.anormcypher.{Cypher, Neo4jREST} | |
import org.apache.spark.sql.{AnalysisException, Row, SQLContext} | |
import org.apache.spark.streaming.twitter.TwitterUtils | |
import org.apache.spark.streaming.{Seconds, StreamingContext} | |
import play.api.libs.ws.ning |
package org.aja.tej.examples.streaming.twitter | |
import com.google.gson.Gson | |
import org.aja.tej.utils.{TejUtils, TejTwitterUtils} | |
import org.anormcypher.{Cypher, Neo4jREST} | |
import org.apache.spark.sql.{AnalysisException, Row, SQLContext} | |
import org.apache.spark.streaming.twitter.TwitterUtils | |
import org.apache.spark.streaming.{Seconds, StreamingContext} | |
import play.api.libs.ws.ning |
import os | |
import boto3 | |
from collections import defaultdict | |
import botocore | |
def get_matching_s3_objects(bucket, | |
aws_access_key_id, | |
aws_secret_access_key, | |
region_name, | |
prefix='', |