# Squash the last X commits into one: soft reset moves HEAD back X commits
# while keeping all of their changes staged. Replace X with the number of
# commits to combine.
git reset --soft HEAD~X
# Commit the combined staged changes as a single new commit.
git commit -m "combined commit message"
# Push the rewritten history; the leading "+" force-updates only this ref
# (safer than a blanket --force).
git push origin +name-of-branch
# Build docker image, assumes Dockerfile is in the current directory | |
docker build -t <name>:<tag> . | |
# Get bash into docker image | |
docker run -it <image> /bin/bash | |
# Get bash into running container | |
docker exec -it <image> /bin/bash | |
# Tail the docker logs |
From Spark 2.0 onwards, column names are no longer case-sensitive in some scenarios. This can be demonstrated by the following example.

**Spark 1.6**

-bash-4.2$ cat /tmp/sample.json
{"test": "first test", "key": "key1"}
{"Test": "second test", "key": "key2"}
scala> val jDF = sqlContext.read.json("/tmp/sample.json")
scala> jDF.printSchema
# Determine the active namenode of HDFS. This is required because the
# webHDFS implementation doesn't redirect requests to the active namenode.
import os
import json
import xml.etree.ElementTree as ET

# urlopen lives in `urllib` on Python 2 and `urllib.request` on Python 3;
# try the Python 2 location first and fall back on ImportError.
try:
    from urllib import urlopen
except ImportError:
    from urllib.request import urlopen
# Generate hex stickers for the SWAN logo with the hexSticker package.
# Both stickers use the same source image; only the hexagon border colour
# and the output file differ.
imgurl <- "/Users/prasanth/logo_swan_letters.png"

# Blue-bordered sticker.
sticker(imgurl, package = "", s_x = 1, s_y = 1, s_width = .6, s_height = .518,
        h_fill = "#ffffff", h_color = "#0053A1", h_size = 1,
        filename = "/Users/prasanth/hex_logo_swan_letters.png")

# Orange-bordered variant.
sticker(imgurl, package = "", s_x = 1, s_y = 1, s_width = .6, s_height = .518,
        h_fill = "#ffffff", h_color = "#FB6700", h_size = 1,
        filename = "/Users/prasanth/hex_logo_swan_letters_1.png")
imgurl <- "/Users/prasanth/logo_swan_letters.png" | |
sticker(imgurl, package="", s_x=1, s_y=1, s_width=.6, s_height=.518, url="https://swan.cern.ch", u_x=1.15, u_y=0.15, u_color="#0053A1", |
# First foray into R, for data science and visualizations:
# a treemap of CERN member-state contributions.
library(treemap)

# Contributions data hosted on Dropbox (`dl=1` forces a direct download).
url <- "https://www.dropbox.com/s/4g7m67xhubizoxe/cern_contributions.csv?dl=1"
CERN <- read.csv(url, header = TRUE, sep = ",")

# Coerce the contribution column to numeric for treemap sizing.
# NOTE(review): if Contribution were read as a factor this would return the
# factor codes, not the values; with stringsAsFactors = FALSE (the default
# since R 4.0) it parses the character values as intended — confirm on the
# target R version.
CERN$Contribution <- as.numeric(CERN$Contribution)
treemap(CERN, | |
index=c("Country"), | |
vSize="Contribution", | |
palette = "Set1", |
# Recursively grant a user rwx on a directory via an HDFS ACL entry.
hdfs dfs -setfacl -R -m user:{user_name}:rwx /path/to/the/directory
# Show the current quota and usage (verbose header, human-readable sizes).
hdfs dfs -count -v -q -h /path/to/the/directory
# Set the space quota to 500 GB (counts raw bytes, i.e. includes replication).
hdfs dfsadmin -setSpaceQuota 500g /path/to/the/directory
# Set the name quota (number of files and directories, i.e. inodes) to 500000.
hdfs dfsadmin -setQuota 500000 /path/to/the/directory
Reading JSON into a Spark DataFrame
Method 1 (efficient: specify the schema when constructing the DataFrame)
from pyspark.sql.types import *
schema = StructType([StructField('aggregated', StringType(), True),
StructField('body', StringType(), True),
StructField('entity', StringType(), True),
StructField('metric_id', StringType(), True),