This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
from pyspark.sql import SparkSession | |
# Import data types | |
from pyspark.sql.types import * | |
from pyspark.sql.functions import when, lit, col, udf | |
spark = SparkSession.builder.appName("Python spark read two files").getOrCreate() | |
accounts_file = sys.argv[1] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scala.Tuple2; | |
import java.util.stream.Collectors; | |
// Drop fields that share a name with an expression: collect each "__"-prefixed column whose unprefixed name is also a column | |
List<String> fieldNames = Arrays.asList(inputDF.columns()); | |
List<Tuple2<String, String>> fieldList = fieldNames.stream() | |
.filter(fieldName -> fieldName.trim().startsWith("__")) | |
.map(fieldName -> Tuple2.apply(fieldName, fieldName.substring(2))) | |
.filter(tuple2 -> fieldNames.contains(tuple2._2)) | |
.collect(Collectors.toList()); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Null out the columns specified in the meta data | |
* | |
* @param inputDataframe The input dataframe to apply nulling out on | |
* @param sparkSession An active spark session | |
* @param sourceEntity The source entity name. | |
* @param targetMetaData The target meta data object | |
* @return A dataframe after applying nulling out on fields specified | |
*/ | |
public static Dataset<Row> applyNullingOut(Dataset<Row> inputDataframe, SparkSession sparkSession, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Find duplicate values in a column and print the rows that contain them | |
awk 'BEGIN { FS="," } { c[$2]++; l[$2,c[$2]]=$0 } END { for (i in c) { if (c[i] > 1) for (j = 1; j <= c[i]; j++) print l[i,j] } }' file.csv | |
# Replace $2 with whichever column you want to check for duplicates | |
# Same code as above, with more comments (note: this version sets FS to ";" rather than ",") | |
BEGIN { FS = ";" } | |
{ |
OlderNewer