mvervuurt / spark_pandas_dataframes.py
Last active March 16, 2020 13:55
Creating a PySpark DataFrame from a Pandas DataFrame
import pandas as pd
from decimal import Decimal
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DecimalType
spark = SparkSession.builder.getOrCreate()
# Create Pandas DataFrame; column order matches the schema below, because
# createDataFrame maps a pandas DataFrame onto an explicit schema by position
pd_person = pd.DataFrame({'ADDRESS': 'Museumplein', 'CITY': 'Amsterdam', 'FIRSTNAME': 'John', 'LASTNAME': 'Doe', 'PERSONID': Decimal(0)}, index=[0])
# Create PySpark DataFrame schema (PERSONID as Decimal to satisfy DecimalType)
p_schema = StructType([StructField('ADDRESS', StringType(), True), StructField('CITY', StringType(), True), StructField('FIRSTNAME', StringType(), True), StructField('LASTNAME', StringType(), True), StructField('PERSONID', DecimalType(), True)])
# Create Spark DataFrame from Pandas
df_person = spark.createDataFrame(pd_person, p_schema)
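A quick way to confirm the conversion worked (not part of the gist preview; assumes the completed snippet above):
# Schema should show four string columns plus PERSONID as decimal(10,0),
# and show() should print the single row
df_person.printSchema()
df_person.show()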
mvervuurt / TimeZoneUtils.java
Created September 23, 2016 20:23
Convert a UTC epoch-seconds timestamp to a ZonedDateTime in a chosen timezone
import java.time.Instant;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
/**
 * Timezone conversion utils between epoch-second timestamps
 * and other date formats
 */
public class TimeZoneUtils {
    // The preview truncates at the class declaration; a minimal method
    // matching the description: UTC epoch seconds -> ZonedDateTime
    public static ZonedDateTime fromEpochSec(long epochSec, String zoneId) {
        return ZonedDateTime.ofInstant(Instant.ofEpochSecond(epochSec), ZoneId.of(zoneId));
    }
}
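For comparison, the same conversion in Python (an analogous sketch, not part of the gist; the timestamp is hypothetical):
from datetime import datetime, timezone
from zoneinfo import ZoneInfo

# Interpret the epoch seconds as UTC, then view them in a chosen timezone
epoch_sec = 1474662180  # hypothetical timestamp
utc_dt = datetime.fromtimestamp(epoch_sec, tz=timezone.utc)
local_dt = utc_dt.astimezone(ZoneInfo("Europe/Amsterdam"))
print(local_dt.isoformat())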
mvervuurt / TrySparkExcel.scala
Created February 22, 2017 13:54
Loading an Excel file with Spark and saving its contents into a Hive table.
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
object TrySparkExcel extends App {
  // Create Spark Session (Hive support is needed to save into a Hive table)
  val ss = SparkSession
    .builder()
    .master("local[*]")
    .appName("TrySparkExcel")
    .enableHiveSupport()
    .getOrCreate()
  // the preview truncates here, before the Excel load and Hive save
}
mvervuurt / TeachScala.sc
Created February 22, 2017 13:57
Teaching basic Scala concepts
/** Simple Base **/
abstract class Animal {
  def hasNose: Boolean
  def hasLegs: Boolean
  def numLegs: Int
}
/** Example Traits **/
trait Mammal {
  def isMammal: Unit = {
    // the preview truncates inside this method; a minimal body:
    println("I am a mammal")
  }
}
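The same two ideas, an abstract base class and a mixin trait, carry over to Python; an analogous sketch, not part of the gist:
from abc import ABC, abstractmethod

class Animal(ABC):
    """Abstract base: subclasses must implement these, like the Scala abstract class."""
    @abstractmethod
    def has_nose(self) -> bool: ...
    @abstractmethod
    def num_legs(self) -> int: ...

class Mammal:
    """Mixin playing the role of the Scala trait."""
    def is_mammal(self) -> None:
        print("I am a mammal")

class Dog(Animal, Mammal):
    def has_nose(self) -> bool: return True
    def num_legs(self) -> int: return 4

Dog().is_mammal()  # prints: I am a mammal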
mvervuurt / SparkExcelLoadingUtils.scala
Created March 6, 2017 20:51
Spark Excel loading utils to transform a DataFrame into one that can be saved as regular rows and columns in Hive.
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
/**
 * Spark Excel loading utils to transform a DataFrame into one
 * that can be saved as regular rows and columns in Hive
 */
object SparkExcelLoadingUtils {
  // The preview truncates here; a sketch of one such utility, casting
  // every column to StringType so the frame writes cleanly to Hive:
  def toStringColumns(df: DataFrame): DataFrame = {
    val schema = StructType(df.schema.fields.map(f => StructField(f.name, StringType, nullable = true)))
    df.map(row => Row.fromSeq(row.toSeq.map(v => Option(v).map(_.toString).orNull)))(RowEncoder(schema))
  }
}
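The Scala method bodies are cut off in the preview; the described transformation is also straightforward in PySpark (a sketch, not part of the gist; the table name is hypothetical):
from pyspark.sql import DataFrame
from pyspark.sql.functions import col

def to_string_columns(df: DataFrame) -> DataFrame:
    """Cast every column to string so the frame saves as plain rows/columns in Hive."""
    return df.select([col(c).cast("string") for c in df.columns])

# to_string_columns(df).write.mode("overwrite").saveAsTable("my_table")  # hypothetical table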
mvervuurt / ScalaPatternMatchingWithDates.sc
Created March 7, 2017 13:56
Scala pattern-matching exercise with dates and regular expressions.
import java.time.LocalDate
import java.time.format.DateTimeFormatter

val digitDate = """(\d\d)-(\d\d)""".r
val letterDate = """(\d\d)-([a-zA-Z]{3})""".r

def toStrHiveDateFrmt(str: String): String = str match {
  case "" => "0001-01-01"
  case digitDate(day, month) =>
    val currentYear = LocalDate.now().getYear
    val tmpDate = LocalDate.of(currentYear, Integer.valueOf(month), Integer.valueOf(day))
    tmpDate.toString // LocalDate.toString is already ISO yyyy-MM-dd
  case letterDate(_, _) => // preview truncates here; a sketch for 'dd-MMM' inputs
    LocalDate.parse(s"$str-${LocalDate.now().getYear}", DateTimeFormatter.ofPattern("dd-MMM-yyyy")).toString
  case _ => str
}
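A hedged Python transcription of the same exercise, useful for comparing the two regex styles (not part of the gist):
import re
from datetime import date, datetime

DIGIT_DATE = re.compile(r"(\d\d)-(\d\d)$")
LETTER_DATE = re.compile(r"(\d\d)-([A-Za-z]{3})$")

def to_hive_date(s: str) -> str:
    """Normalize 'dd-mm' / 'dd-MMM' inputs to ISO dates, like toStrHiveDateFrmt."""
    if s == "":
        return "0001-01-01"
    year = date.today().year
    m = DIGIT_DATE.match(s)
    if m:
        day, month = m.groups()
        return date(year, int(month), int(day)).isoformat()
    if LETTER_DATE.match(s):
        parsed = datetime.strptime(s, "%d-%b")  # '%b' matches e.g. 'Mar'
        return date(year, parsed.month, parsed.day).isoformat()
    return s

print(to_hive_date("07-03"), to_hive_date("07-Mar"))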
mvervuurt / TimeDiff.scala
Created August 9, 2017 12:10
TimeDiff based on OffsetDateTime
import java.time.{OffsetDateTime, ZoneId}
import java.time.temporal.ChronoUnit
val endDateTime = OffsetDateTime.parse("2017-08-09T18:00:00+02:00") // hypothetical end time; not shown in the preview
val endDateTimeOffset = endDateTime.getOffset.toString
// Create OffsetDateTime in the chosen timezone with the correct offset
val startDateTime = OffsetDateTime.now(ZoneId.of(endDateTimeOffset))
val timeDiff = ChronoUnit.HOURS.between(startDateTime, endDateTime)
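The equivalent difference-in-hours computation in Python (an analogous sketch; the end time is hypothetical):
from datetime import datetime, timezone, timedelta

# Hypothetical end time with a fixed +02:00 offset
end_dt = datetime(2017, 8, 9, 18, 0, tzinfo=timezone(timedelta(hours=2)))
start_dt = datetime.now(end_dt.tzinfo)
# Whole hours between the two instants, like ChronoUnit.HOURS.between
time_diff_hours = int((end_dt - start_dt).total_seconds() / 3600)
print(time_diff_hours)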
mvervuurt / custom_eval_metrics_xgboost.py
Created July 25, 2019 12:19
Custom evaluation metrics for XGBoost: precision and f1_score
from typing import Tuple
import numpy as np
import xgboost as xgb
from sklearn.metrics import precision_score

def xgb_precision(proba_y: np.ndarray, dataset: xgb.DMatrix) -> Tuple[str, float]:
    '''Returns binary classification precision using a 0.5 threshold.
    proba_y: 1-d array of predicted probabilities for the positive class
    dataset: xgboost DMatrix holding the true labels
    '''
    y = dataset.get_label()
    thresh_func = np.vectorize(lambda x: 1 if x > 0.5 else 0)
    pred_y = thresh_func(proba_y)
    return 'clf_precision', precision_score(y, pred_y)
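A sketch of how such a metric plugs into training via the feval hook (dtrain/dvalid are hypothetical DMatrix objects; newer XGBoost versions expose this as custom_metric):
booster = xgb.train(
    {'objective': 'binary:logistic'},
    dtrain,
    num_boost_round=50,
    evals=[(dvalid, 'valid')],
    feval=xgb_precision,   # reports 'clf_precision' each round
    maximize=True,
)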
# set global figsize rcParams
from pylab import rcParams
rcParams['figure.figsize'] = 12,5
mvervuurt / sample_statsmodels.py
Created August 27, 2019 10:13
Statsmodels datasets and using sm.tsa.datetools
import pandas as pd
import statsmodels.api as sm

# Load the US macroeconomic dataset as a pandas DataFrame
df = sm.datasets.macrodata.load_pandas().data
# Build a quarterly date index covering the sample period
df.index = pd.Index(sm.tsa.datetools.dates_from_range('1959Q1', '2009Q3'))
# Print the dataset's accompanying description
print(sm.datasets.macrodata.NOTE)
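With the quarterly index in place, the frame behaves like any time series; a quick sketch (column names come from the macrodata set):
# Inspect the first rows and slice the real-GDP series by year
print(df[['realgdp', 'realcons']].head())
print(df.loc['1960':'1962', 'realgdp'])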