#!/usr/bin/env bash
# Submit a Spark 2 job to YARN with Kryo serialization and a custom log4j.properties,
# shipped via --files and referenced from the driver/executor extraJavaOptions.
export SPARK_MAJOR_VERSION=2
/usr/hdp/current/spark2-client/bin/spark-submit --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
--files /path/to/log4j.properties \
--conf spark.yarn.executor.memoryOverhead=1024 \
--conf spark.port.maxRetries=64 \
--conf spark.driver.extraJavaOptions='-Dlog4j.debug -Dlog4j.configuration=file:/path/to/log4j.properties -Da=a1' \
--conf spark.executor.extraJavaOptions='-Dlog4j.debug -Dlog4j.configuration=log4j.properties' \
--master yarn \
<?xml version="1.0" encoding="UTF-8"?>
<ruleset xmlns="http://mojo.codehaus.org/versions-maven-plugin/rule/2.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" comparisonMethod="maven" xsi:schemaLocation="http://mojo.codehaus.org/versions-maven-plugin/rule/2.0.0 http://mojo.codehaus.org/versions-maven-plugin/xsd/rule-2.0.0.xsd">
  <ignoreVersions>
    <!-- Ignore alphas, betas, release candidates and milestones -->
    <ignoreVersion type="regex">(?i).*Alpha(?:-?\d+)?</ignoreVersion>
    <ignoreVersion type="regex">(?i).*Beta(?:-?\d+)?</ignoreVersion>
    <ignoreVersion type="regex">(?i).*-B(?:-?\d+)?</ignoreVersion>
    <ignoreVersion type="regex">(?i).*RC(?:-?\d+)?</ignoreVersion>
    <ignoreVersion type="regex">(?i).*CR(?:-?\d+)?</ignoreVersion>
    <ignoreVersion type="regex">(?i).*M(?:-?\d+)?</ignoreVersion>
  </ignoreVersions>
</ruleset>

<!-- pom.xml: point the versions-maven-plugin at the rules file above -->
<plugin>
  <groupId>org.codehaus.mojo</groupId>
  <artifactId>versions-maven-plugin</artifactId>
  <version>2.3</version>
  <configuration>
    <rulesUri>file:///${project.basedir}/versions-maven-rules.xml</rulesUri>
  </configuration>
  <executions>
    <execution>
      <phase>compile</phase>
      <goals>
        <!-- illustrative goal: report available dependency updates -->
        <goal>display-dependency-updates</goal>
      </goals>
    </execution>
  </executions>
</plugin>
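With the plugin configured this way, the update report can also be produced directly from the command line; a typical invocation (goal name per the versions-maven-plugin documentation) is:

mvn versions:display-dependency-updates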
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
default_args = {
    'owner': 'myowner',
    'depends_on_past': False,
    'start_date': datetime(year=2017, month=10, day=18, hour=0, minute=0),
    'email': ['me@example.com'],
    'email_on_failure': True,
}

from datetime import datetime, timedelta
from airflow import DAG
from airflow import utils
from airflow.operators import BashOperator, EmailOperator, DummyOperator
default_args = {
    'owner': 'myowner',
    'depends_on_past': False,
    'start_date': datetime(year=2017, month=10, day=18, hour=0, minute=0),
    'email': ['me@example.com'],
}
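A minimal sketch of how a default_args dict like this is typically wired into a DAG and a task; the dag_id, schedule_interval and bash command below are illustrative assumptions, not part of the original snippet:

dag = DAG(
    dag_id='my_dag',                      # illustrative name
    default_args=default_args,
    schedule_interval=timedelta(days=1),  # illustrative schedule
)

t1 = BashOperator(
    task_id='print_date',
    bash_command='date',                  # illustrative command
    dag=dag,
)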
# dump messages to stdout, uses old consumer api!
kafka-console-consumer --zookeeper localhost:2181 --topic my_topic --from-beginning
# describe topic
kafka-topics --zookeeper localhost:2181 --describe --topic my_topic
# alter topic
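A hedged example of an alter command, here increasing the partition count (the count of 8 is purely illustrative):

kafka-topics --zookeeper localhost:2181 --alter --topic my_topic --partitions 8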
seahrh / easypipe.py
Created May 1, 2018 03:42 — forked from dannguyen/easypipe.py
Using scikit-learn to classify NYT columnists
# some convenience functions here, nothing new
'''
# usage:
from easypipe import easy_pipeline
from easypipe import print_metrics
data_folder = "data-hold/20news"
p = easy_pipeline()
print_metrics(p, data_folder)
'''
import numpy as np
import pandas as pd

# Shuffle dataframe rows (assumes `cities` is an existing DataFrame)
cities.reindex(np.random.permutation(cities.index))
# Read data from Google Cloud Storage
california_housing_dataframe = pd.read_csv("https://storage.googleapis.com/mledu-datasets/california_housing_train.csv", sep=",")
# Convert pandas data into a dict of np arrays
# where `key` is column name.
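A minimal sketch of that conversion, applied to the dataframe loaded above (the name `features` is arbitrary):

features = {key: np.array(value)
            for key, value in dict(california_housing_dataframe).items()}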
seahrh / tf_model_size.py
Created June 8, 2018 06:08
Get model size after L1 regularization
# Get model size (number of non-zero weights) after L1 regularization
# see: https://colab.research.google.com/notebooks/mlcc/sparsity_and_l1_regularization.ipynb?utm_source=mlcc&utm_campaign=colab-external&utm_medium=referral&utm_content=l1regularization-colab&hl=en#scrollTo=e6GfTI0CFhB8
import numpy as np

def model_size(estimator):
    variables = estimator.get_variable_names()
    size = 0
    for variable in variables:
        # skip bookkeeping and optimizer variables that are not model weights
        if not any(x in variable
                   for x in ['global_step',
                             'centered_bias_weight',
                             'bias_weight',
                             'Ftrl']):
            size += np.count_nonzero(estimator.get_variable_value(variable))
    return size
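A hypothetical usage line, assuming a trained estimator named `linear_classifier` (the estimator name is an assumption, not part of the snippet):

print("Model size: %d non-zero weights" % model_size(linear_classifier))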