# Build docker image, assumes Dockerfile is in the current directory
docker build -t <name>:<tag> .
# Get bash into docker image
docker run -it <image> /bin/bash
# Get bash into running container
docker exec -it <image> /bin/bash
# Tail the docker logs
From Spark 2.0 onwards, column names are no longer case-sensitive in some scenarios. This can be demonstrated by the following example:
**Spark 1.6**
-bash-4.2$ cat /tmp/sample.json
{"test": "first test", "key": "key1"}
{"Test": "second test", "key": "key2"}
scala> val jDF = sqlContext.read.json("/tmp/sample.json")
scala> jDF.printSchema
#Determine the active namenode of HDFS, this is required as the webHDFS implementation doesn't redirect to active namenode
import os
import json
import xml.etree.ElementTree as ET

# urlopen lives in different modules on Python 2 and Python 3; try the
# Python 2 location first and fall back to the Python 3 one.
try:
    from urllib import urlopen
except ImportError:
    from urllib.request import urlopen
# Render two stickers from the same logo image. The two calls differ only in
# h_color and in the output filename.
# NOTE(review): sticker() is presumably hexSticker::sticker -- confirm the
# package is attached before this runs.
imgurl <- "/Users/prasanth/logo_swan_letters.png"

# Variant 1: h_color "#0053A1"
sticker(
  imgurl,
  package = "",
  s_x = 1, s_y = 1,
  s_width = .6, s_height = .518,
  h_fill = "#ffffff",
  h_color = "#0053A1",
  h_size = 1,
  filename = "/Users/prasanth/hex_logo_swan_letters.png"
)

# Variant 2: h_color "#FB6700"
sticker(
  imgurl,
  package = "",
  s_x = 1, s_y = 1,
  s_width = .6, s_height = .518,
  h_fill = "#ffffff",
  h_color = "#FB6700",
  h_size = 1,
  filename = "/Users/prasanth/hex_logo_swan_letters_1.png"
)
imgurl <- "/Users/prasanth/logo_swan_letters.png"
sticker(imgurl, package="", s_x=1, s_y=1, s_width=.6, s_height=.518, url="", u_x=1.15, u_y=0.15, u_color="#0053A1",
# first foray into R, for data science and visualizations
# treemap of CERN contributions
# Location of the CERN contributions CSV (left blank here)
url <- ""
# Load the CSV; the first row holds the column names
CERN <- read.csv(url, header=TRUE, sep=",")
# Convert Contribution to numeric. If read.csv produced a factor (the default
# for text columns before R 4.0), as.numeric() alone would return the internal
# level codes rather than the values, so go through as.character() first.
if (is.factor(CERN$Contribution)) {
  CERN$Contribution <- as.numeric(as.character(CERN$Contribution))
} else {
  CERN$Contribution <- as.numeric(CERN$Contribution)
}
palette = "Set1",
# read the csv into R dataframe
#url <- "/Users/prasanth/Desktop/amzn.csv"
url <- ""
amzn <- read.csv(file = url, header = TRUE, sep = ",")
# Sort the rows by ascending year so the data can be inspected in order
amzn <- amzn[order(amzn$year), ]
# Simple line graph with ggplot2
# Set ACL: recursively (-R) modify (-m) the ACL so that {user_name} has rwx on the directory tree
hdfs dfs -setfacl -R -m user:{user_name}:rwx /path/to/the/directory
# Find the quota: show quotas and usage (-q) with a header row (-v) and human-readable sizes (-h)
hdfs dfs -count -v -q -h /path/to/the/directory
# Set the space quota (500g here) on the directory
hdfs dfsadmin -setSpaceQuota 500g /path/to/the/directory
# Set the file / dir (inodes) quota: the maximum number of names under the directory (500000 here)
hdfs dfsadmin -setQuota 500000 /path/to/the/directory
git magic commands

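squash the last X commits into a single commit (note: the `+` before the branch name makes the final push a force push, which rewrites the branch's history on the remote)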

git reset --soft HEAD~X  
git commit -m "combined commit message"
git push origin +name-of-branch

find all large files in the root filesystem

find / -xdev -type f -size +100M
find / -xdev -type f -size +100M -exec ls -la {} \; | sort -nk 5
find / -xdev -type f -size +100M -exec du -sh {} ';' | sort -rh | head -n50

install netstat on centos 7 (netstat is provided by the net-tools package)

yum install -y net-tools

Reading json into Spark Dataframe

method 1 (efficient: specify the schema when constructing the dataframe)

from pyspark.sql.types import *
schema = StructType([StructField('aggregated', StringType(), True),
                     StructField('body', StringType(), True),
                     StructField('entity', StringType(), True),
                     StructField('metric_id', StringType(), True),