Skip to content

Instantly share code, notes, and snippets.

@t3rmin4t0r
t3rmin4t0r / join-test.py
Last active August 29, 2015 14:27
join query generator for testing
import itertools
from random import shuffle
def permutations(l):
    """Return all permutations of ``l`` for every length from 0 to ``len(l)``.

    The result is a list of tuples, grouped by increasing length: first the
    single empty tuple, then every 1-element permutation, and so on up to the
    full-length permutations. Within one length the order is the one produced
    by ``itertools.permutations``.

    ``l`` must support ``len()`` and iteration (list, tuple, string, ...).
    """
    result = []
    # ``range`` (not ``xrange``) so the helper runs on both Python 2 and 3;
    # the +1 makes the full-length permutations part of the output.
    for size in range(len(l) + 1):
        result.extend(itertools.permutations(l, size))
    return result
@t3rmin4t0r
t3rmin4t0r / ats-plan-fetcher.py
Last active April 20, 2016 05:04
Hive ATS Query plan fetcher
import os,sys,re,math,os.path
from collections import defaultdict
from itertools import groupby
from bz2 import BZ2File
from gzip import GzipFile as GZFile
import getopt
from json import loads as json_parse
from json import dumps as json_print
from md5 import md5 as md5_hash
from xml.dom.minidom import parse as xmlparse
@t3rmin4t0r
t3rmin4t0r / HdfsSeekRead.java
Last active September 21, 2019 09:25
hdfs seek benchmark
// import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.google.common.base.Stopwatch;
import java.io.IOException;
@t3rmin4t0r
t3rmin4t0r / xcross.py
Last active November 6, 2015 23:23
Cross product optimizer query gen
divisor = 19;
print """
drop table if exists postal_distribution;
CREATE TABLE if not exists postal_distribution(
zipcode varchar(6), lat double, lon double)
partitioned by (xcross int)
stored as orc;
@t3rmin4t0r
t3rmin4t0r / cpu_count.awk
Created January 9, 2016 14:28
Total # of physical cpus from CPU Info
# Sum the "cpu cores" value recorded for each distinct "physical id".
# Input is presumably /proc/cpuinfo-style "key : value" lines -- confirm.

# Split every record on ":" so $2 is the value part of the line.
BEGIN {
    FS = ":"
}

# Remember the most recent physical id seen.
/^physical id/ {
    phys = $2
}

# Store the core count under that physical id; repeated entries for the
# same id overwrite each other, so each id contributes once to the total.
/^cpu cores/ {
    cores[phys] = $2
}

# Add up the per-id core counts and report the total.
END {
    total = 0
    for (id in cores) {
        total += cores[id]
    }
    print "Total cores = ", total
}
@t3rmin4t0r
t3rmin4t0r / MonBuffers.java
Created January 15, 2016 06:59
Monitoring direct memory in the JVM (from https://blogs.oracle.com/alanb/entry/monitoring_direct_buffers, adapted for JDK8)
import java.io.File;
import java.util.*;
import java.lang.management.BufferPoolMXBean;
import java.lang.management.ManagementFactory;
import javax.management.MBeanServerConnection;
import javax.management.ObjectName;
import javax.management.remote.*;
import com.sun.tools.attach.VirtualMachine; // Attach API
@t3rmin4t0r
t3rmin4t0r / history.pig.py
Created February 2, 2016 00:26
PIG Script to read history files
import os, sys, re, math
from org.apache.pig.scripting import *
operation = Pig.compile("""
set pig.splitCombination false;
set tez.grouping.min-size 52428800;
set tez.grouping.max-size 52428800;
@t3rmin4t0r
t3rmin4t0r / slow-packets.py
Last active October 13, 2022 20:49
tcpdump analysis for delayed packets
import sys, re, os, math
import dpkt
import socket
from collections import defaultdict
def ip_str(ip):
    """Convert a packed 4-byte IPv4 address into its dotted-quad string."""
    dotted_quad = socket.inet_ntoa(ip)
    return dotted_quad
class Connection(object):
def __init__(self):
@t3rmin4t0r
t3rmin4t0r / rpm-urls.py
Created August 5, 2016 23:25
Get URLs for all HDP rpms
import yum
yb = yum.YumBase()
yb.setCacheDir()
pkgs=[p for p in yb.pkgSack.returnNewestByNameArch(patterns='*.rpm') if 'HDP' in p.repoid]
for p in pkgs:
print "wget -c ", p.remote_url
@t3rmin4t0r
t3rmin4t0r / tez2graph.py
Last active April 22, 2022 02:28
Convert Hive Tez explains into images for debugging
import re, sys
# NX records whether the optional networkx dependency is importable; it is
# set to False (with a hint on stderr) when the import fails.
NX = True
try:
    import networkx as nx
except ImportError:
    # Catch only a failed import: a bare ``except`` would also swallow
    # KeyboardInterrupt/SystemExit and hide unrelated errors.
    NX = False
    sys.stderr.write("Could not import nx\npip install networkx, please\n")
plan39 = """
Map 1 <- Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE), Map 8 (BROADCAST_EDGE), Reducer 7 (BROADCAST_EDGE), Reducer 9 (BROADCAST_EDGE)