python -m SimpleHTTPServer 8000   # Python 2; on Python 3 use: python -m http.server 8000
then open localhost:8000 in the browser
- alternatively, install the http-server npm module globally:
npm install http-server -g
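Once installed, http-server serves the current directory; the -p flag sets the port:
http-server -p 8000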
# A simple script to convert a traffic CSV to a Parquet file. Demonstrates converting CSV
# to Parquet, using UDFs, and applying a custom schema.
import argparse
from pyspark.sql import SparkSession
# Import data types
from pyspark.sql.types import *
from pyspark.sql.functions import when, lit, col, udf

def convert_csv_to_parquet(spark_context, custom_schema, csv_file, parquet_file):
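    # A minimal sketch of a plausible body (an assumption - the original body is not
    # shown, and spark_context is assumed to actually be a SparkSession):
    df = spark_context.read.csv(csv_file, schema=custom_schema, header=True)
    df.write.mode("overwrite").parquet(parquet_file)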
# Having just started my Spark journey, everything is a discovery, so I am jotting down a few notes
df = sqlContext.createDataFrame([{'name': 'Alice', 'age': 1, 'gender': 'F'}])
# display all the columns
df.show()
# limit to a few columns
df.select('name', 'age').show()
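A couple more one-liners in the same exploratory spirit (my own additions, not part of the original notes):

    # keep only rows matching a condition
    df.filter(df.age > 0).show()
    # add a derived column
    df.withColumn('adult', df.age >= 18).show()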
This gist includes components of an Oozie workflow - scripts/code, sample data
and commands; Oozie actions covered: shell action, email action
Action 1: The shell action executes a shell script that does a line count for files in a
glob provided, and writes the line count to standard output
Action 2: The email action emails the output of action 1
Pictorial overview of job:
--------------------------
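A sketch of the kind of script Action 1 might run (the script itself is not included here; the HDFS path handling and the lineCount key are illustrative assumptions - an Oozie shell action with <capture-output/> expects key=value pairs on stdout so a later action can read them via wf:actionData()):

    #!/bin/bash
    # count lines across all HDFS files matching the glob passed as the first argument
    cnt=$(hadoop fs -cat "$1" | wc -l)
    # emit a key=value pair for the downstream email action to pick up
    echo "lineCount=$cnt"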
import codecs
import json
# the file is UTF-16 encoded, so open it via codecs with an explicit encoding
json_data = json.load(codecs.open('url_entities.json', 'r', 'utf-16'))
# materialise the top-level JSON array into a list of rows
json_rows = [r for r in json_data]
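If each entry is a flat dict, the rows drop straight into a pandas DataFrame (an assumption about the file's shape):

    import pandas as pd
    df = pd.DataFrame(json_rows)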
# List unique values in a DataFrame column
# h/t @makmanalp for the updated syntax!
df['Column Name'].unique()
# Convert Series datatype to numeric (will error if column has non-numeric values)
# h/t @makmanalp
pd.to_numeric(df['Column Name'])
# Convert Series datatype to numeric, changing non-numeric values to NaN
# h/t @makmanalp for the updated syntax!
pd.to_numeric(df['Column Name'], errors='coerce')
git checkout my_branch
git reset --soft HEAD~4
git commit
git push --force origin my_branch
## The above undoes the last four commits you have pushed and recommits them as a single
## new commit. It can be done on any branch, but it is good practice to only do it on a feature branch.
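A slightly safer variant of the last step is --force-with-lease, which aborts the push if someone else has updated the remote branch in the meantime:
git push --force-with-lease origin my_branch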
scala> val s = """(\d+)-(\d+)-(\d+).*""".r | |
s: scala.util.matching.Regex = (\d+)-(\d+)-(\d+).* | |
scala> val s(a,b,c) = "20-30-04 jfa" | |
a: String = 20 | |
b: String = 30 | |
c: String = 04 |
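One caveat: this style of binding is all-or-nothing, so a string that does not match the pattern fails at runtime with something like:

scala> val s(a,b,c) = "no digits here"
scala.MatchError: no digits here (of class java.lang.String)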
scala> val left = Seq((0), (1)).toDF("id") | |
left: org.apache.spark.sql.DataFrame = [id: int] | |
scala> val right = Seq((0, "zero"), (0, "four")).toDF("id", "right")
right: org.apache.spark.sql.DataFrame = [id: int, right: string]

scala> left.join(right, "id").show
+---+-----+ | |
| id|right| | |
+---+-----+ | |
|  0| zero|
|  0| four|
+---+-----+ |
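Because right holds two rows with id 0, the inner join fans the matching left row out into two result rows, while id 1 from left is dropped for lack of a match.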
import sys
from pyspark.sql import SparkSession
# Import data types
from pyspark.sql.types import *
from pyspark.sql.functions import when, lit, col, udf

spark = SparkSession.builder.appName("Python spark read two files").getOrCreate()
accounts_file = sys.argv[1] |
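A sketch of how the script might continue, given its "read two files" app name (the second argument, the file format, and the join key are assumptions, not from the original):

    transactions_file = sys.argv[2]
    # read both inputs as CSV with a header row, letting Spark infer column types
    accounts_df = spark.read.csv(accounts_file, header=True, inferSchema=True)
    transactions_df = spark.read.csv(transactions_file, header=True, inferSchema=True)
    # e.g. join the two datasets on a shared account_id column
    accounts_df.join(transactions_df, "account_id").show()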