Jai Prakash (prakashrd)

🏠
Working from home
View GitHub Profile
# Find duplicate values in a column and extract those rows
awk 'BEGIN { FS="," } { c[$2]++; l[$2,c[$2]]=$0 } END { for (i in c) { if (c[i] > 1) for (j = 1; j <= c[i]; j++) print l[i,j] } }' file.csv
# replace $2 with whichever column you want to check for duplicates
# The same code as above, with more comments
BEGIN { FS = ";" }
{
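# For comparison, a hedged pandas equivalent of the duplicate-row extraction above
# (the column name "col2" and the file name are assumptions, not from the gist):
import pandas as pd

df = pd.read_csv("file.csv")
# keep=False marks every row whose "col2" value occurs more than once
dups = df[df.duplicated(subset="col2", keep=False)]
print(dups)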
/**
 * Null out the columns specified in the metadata.
 *
 * @param inputDataframe The input dataframe to apply nulling out on
 * @param sparkSession   An active Spark session
 * @param sourceEntity   The source entity name
 * @param targetMetaData The target metadata object
 * @return A dataframe with the specified fields nulled out
 */
public static Dataset<Row> applyNullingOut(Dataset<Row> inputDataframe, SparkSession sparkSession,
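The Java preview above cuts off mid-signature; as a rough, hedged sketch of the same idea (nulling out a list of columns) in PySpark, where the column list and function name are my own assumptions rather than the gist's code:
from pyspark.sql.functions import lit

def apply_nulling_out(df, columns_to_null):
    # Replace each listed column with NULL, preserving its original data type
    for c in columns_to_null:
        df = df.withColumn(c, lit(None).cast(df.schema[c].dataType))
    return df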
@prakashrd
prakashrd / scala_java8.java
Created April 2, 2019 11:44
Java8 Snippets
import scala.Tuple2;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
// Find "__"-prefixed fields whose unprefixed name matches an existing field, so they can be dropped
List<String> fieldNames = Arrays.asList(inputDF.columns());
List<Tuple2<String, String>> fieldList = fieldNames.stream()
.filter(fieldName -> fieldName.trim().startsWith("__"))
.map(fieldName -> Tuple2.apply(fieldName, fieldName.substring(2)))
.filter(tuple2 -> fieldNames.contains(tuple2._2))
.collect(Collectors.toList());
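A hedged Python analog of the stream above, for readers more familiar with PySpark (df stands in for the same input dataframe; this is my own sketch, not from the gist):
# Pairs of ("__name", "name") where the unprefixed name also exists as a column
field_list = [(c, c[2:]) for c in df.columns
              if c.startswith("__") and c[2:] in df.columns]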
@prakashrd
prakashrd / pyspark_two_files.py
Created March 16, 2019 13:02
PySpark: read two files, join on a column, and print the resulting df
import sys
from pyspark.sql import SparkSession
# Import data types
from pyspark.sql.types import *
from pyspark.sql.functions import when, lit, col, udf
spark = SparkSession.builder.appName("Python spark read two files").getOrCreate()
accounts_file = sys.argv[1]
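The preview stops after reading the first argument; a hedged sketch of how the rest of the script presumably proceeds (the file format, the second argument, and the join column "account_id" are my assumptions):
transactions_file = sys.argv[2]

# Read both files as CSV with headers (assumed format)
accounts_df = spark.read.csv(accounts_file, header=True, inferSchema=True)
transactions_df = spark.read.csv(transactions_file, header=True, inferSchema=True)

# Join the two dataframes on a common column and print the result
result_df = accounts_df.join(transactions_df, on="account_id", how="inner")
result_df.show()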
@prakashrd
prakashrd / spark-join.scala
Created March 7, 2019 11:57
spark-joining-datasets
scala> val left = Seq((0), (1)).toDF("id")
left: org.apache.spark.sql.DataFrame = [id: int]
scala> left.join(right, "id").show
+---+-----+
| id|right|
+---+-----+
| 0| zero|
| 0| four|
+---+-----+
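The `right` dataframe never appears in the preview above; a hedged PySpark version of the same inner-join pattern, with `right` filled in from my own sample data rather than the gist's:
left = spark.createDataFrame([(0,), (1,)], ["id"])
right = spark.createDataFrame([(0, "zero"), (1, "one")], ["id", "right"])  # sample data, my assumption
left.join(right, "id").show()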
@prakashrd
prakashrd / regex.scala
Created July 5, 2018 08:45
Scala Regex
scala> val s = """(\d+)-(\d+)-(\d+).*""".r
s: scala.util.matching.Regex = (\d+)-(\d+)-(\d+).*
scala> val s(a,b,c) = "20-30-04 jfa"
a: String = 20
b: String = 30
c: String = 04
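A hedged Python equivalent of the Scala regex extractor above, using the re module:
import re

m = re.match(r"(\d+)-(\d+)-(\d+).*", "20-30-04 jfa")
a, b, c = m.groups()  # a='20', b='30', c='04'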
# List unique values in a DataFrame column
# h/t @makmanalp for the updated syntax!
df['Column Name'].unique()
# Convert Series datatype to numeric (will error if column has non-numeric values)
# h/t @makmanalp
pd.to_numeric(df['Column Name'])
# Convert Series datatype to numeric, changing non-numeric values to NaN
# h/t @makmanalp for the updated syntax!
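# The preview cuts off before the actual call; the coerce variant the comment
# above describes is presumably:
pd.to_numeric(df['Column Name'], errors='coerce')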
@prakashrd
prakashrd / squash_commits_after_push.sh
Last active May 18, 2018 07:39
Git: Squash commits after push
git checkout my_branch
git reset --soft HEAD~4
git commit
git push --force origin my_branch
## The above resets the last four commits you have pushed. Though this can be done on any branch,
## it is good practice to do it only on a feature branch.
@prakashrd
prakashrd / read_encoded_file.py
Created April 11, 2018 02:17
Read a UTF-16 encoded file with Python
import codecs
import json
json_data = json.load(codecs.open('url_entities.json', 'r', 'utf-16'))
json_rows = [r for r in json_data]
@prakashrd
prakashrd / 00-OozieWorkflowShellAction
Created July 5, 2017 07:34 — forked from airawat/00-OozieWorkflowShellAction
Oozie workflow with a shell action with CaptureOutput: counts the lines in a provided glob and writes the count to standard output. A subsequent email action emails the output of the shell action.
This gist includes the components of an Oozie workflow: scripts/code, sample data,
and commands. Oozie actions covered: shell action, email action.
Action 1: The shell action executes a shell script that does a line count for files in the
provided glob, and writes the line count to standard output
Action 2: The email action emails the output of action 1
Pictorial overview of job:
--------------------------