# Build docker image, assumes Dockerfile is in the current directory
docker build -t <name>:<tag> .
# Get a bash shell in a new container started from an image
docker run -it <image> /bin/bash
# Get a bash shell in a running container (takes a container name/ID, not an image)
docker exec -it <container> /bin/bash
# Tail the logs of a running container
docker logs -f <container>
From Spark 2.0 onwards, column names are no longer case sensitive in some scenarios; this can be demonstrated with the following example.
**Spark 1.6**
-bash-4.2$ cat /tmp/sample.json
{"test": "first test", "key": "key1"}
{"Test": "second test", "key": "key2"}
scala> val jDF = sqlContext.read.json("/tmp/sample.json")
scala> jDF.printSchema
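In Spark 2.x the analyzer resolves column names case-insensitively by default, so referencing "test" when the schema also contains "Test" typically fails with an ambiguity error. A minimal PySpark sketch of switching the 1.6-style behaviour back on, assuming a SparkSession named spark:

spark.conf.set("spark.sql.caseSensitive", "true")  # restore case-sensitive resolution
jDF = spark.read.json("/tmp/sample.json")
jDF.select("test").show()  # resolves unambiguously with the flag on; ambiguous with the default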
# Determine the active namenode of HDFS; this is needed because the webHDFS implementation doesn't redirect to the active namenode
import os
import json
import xml.etree.ElementTree as ET
try:
    from urllib import urlopen          # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3
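
A minimal sketch of the rest of the script, under the assumption that the namenode hostnames are known up front (the hostnames below are placeholders) and that the namenode HTTP port is the pre-Hadoop-3 default of 50070: poll each namenode's JMX servlet and return the one reporting an active HA state.

NAMENODES = ["nn1.example.com", "nn2.example.com"]  # placeholder hostnames, adjust to the cluster

def find_active_namenode():
    for nn in NAMENODES:
        url = ("http://%s:50070/jmx"
               "?qry=Hadoop:service=NameNode,name=NameNodeStatus" % nn)
        beans = json.loads(urlopen(url).read().decode("utf-8"))["beans"]
        if beans and beans[0].get("State") == "active":
            return nn
    return None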
library(hexSticker)  # sticker() comes from the hexSticker package

imgurl <- "/Users/prasanth/logo_swan_letters.png"
sticker(imgurl, package="", s_x=1, s_y=1, s_width=.6, s_height=.518,
        h_fill="#ffffff", h_color="#0053A1", h_size=1, filename="/Users/prasanth/hex_logo_swan_letters.png")
sticker(imgurl, package="", s_x=1, s_y=1, s_width=.6, s_height=.518,
        h_fill="#ffffff", h_color="#FB6700", h_size=1, filename="/Users/prasanth/hex_logo_swan_letters_1.png")
sticker(imgurl, package="", s_x=1, s_y=1, s_width=.6, s_height=.518, url="https://swan.cern.ch", u_x=1.15, u_y=0.15, u_color="#0053A1",
        h_fill="#ffffff", h_color="#0053A1", h_size=1, filename="/Users/prasanth/hex_logo_swan_letters_2.png")  # closing arguments assumed by analogy with the calls above; filename is a guess
# first foray into R, for data science and visualizations
# treemap of CERN contributions
url <- "https://www.dropbox.com/s/4g7m67xhubizoxe/cern_contributions.csv?dl=1"
CERN <- read.csv(url, header=TRUE, sep=",")
CERN$Contribution <- as.numeric(CERN$Contribution)
library(treemap)
treemap(CERN,
        index=c("Country"),
        vSize="Contribution",
        palette="Set1")
# read the csv into R dataframe
#url <- "/Users/prasanth/Desktop/amzn.csv"
url <- "https://www.dropbox.com/s/ffq7d5dnb47jzig/amzn.csv?dl=1"
amzn <- read.csv(url, header=TRUE, sep=",")
# Order the dataframe and inspect data
amzn <- amzn[order(amzn$year),]
head(amzn)
# Simple line graph with ggplot2; the y column name 'revenue' is assumed, adjust to the actual data
library(ggplot2)
ggplot(amzn, aes(x=year, y=revenue)) + geom_line()
# set acl
hdfs dfs -setfacl -R -m user:{user_name}:rwx /path/to/the/directory
# find the quota
hdfs dfs -count -v -q -h /path/to/the/directory
# Set the space quota
hdfs dfsadmin -setSpaceQuota 500g /path/to/the/directory
# Set file / dir (inodes) quota
hdfs dfsadmin -setQuota 500000 /path/to/the/directory
git magic commands

squash the last X commits into a single commit

combine
git reset --soft HEAD~X  
git commit -m "combined commit message"
push
git push origin +name-of-branch

find all large files in the root filesystem

find / -xdev -type f -size +100M
OR
find / -xdev -type f -size +100M -exec ls -la {} \; | sort -nk 5
OR
find / -xdev -type f -size +100M -exec du -sh {} ';' | sort -rh | head -n50

install netstat on centos 7

yum install -y net-tools  # netstat ships in the net-tools package

Reading JSON into a Spark DataFrame

method 1 (efficient: specify the schema when constructing the DataFrame)

from pyspark.sql.types import *
schema = StructType([StructField('aggregated', StringType(), True),
                     StructField('body', StringType(), True),
                     StructField('entity', StringType(), True),
                     StructField('metric_id', StringType(), True)])  # close the field list; add further fields as needed
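
With the schema supplied up front, Spark skips the extra pass it would otherwise make over the data to infer types. A minimal sketch of the read itself, assuming a SparkSession named spark and a placeholder path:

df = spark.read.json('/path/to/metrics.json', schema=schema)
df.printSchema()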