Skip to content

Instantly share code, notes, and snippets.

@t3rmin4t0r
t3rmin4t0r / orc-stripe-verify.py
Created July 2, 2014 17:37
ORC stripe verifier
import sys
import re
# Matches the per-stripe summary lines of an ORC dump and captures, in order,
# the stripe offset, its data length and its row count.  `+` (not `*`) is used
# so that lines with an empty numeric field are skipped instead of crashing
# the int() conversion.
S_RE = re.compile(r'Stripe: offset: ([0-9]+) data: ([0-9]+) rows: ([0-9]+).*')

# Size of the block a stripe is expected to stay inside (256 MiB).
stripe_size = 256 * 1024 * 1024


def parse_stripes(lines):
    """Extract stripe descriptions from an iterable of text lines.

    Lines that do not contain a stripe summary are ignored.

    Returns a list of ``(offset, data_length, rows)`` integer tuples, in
    input order.
    """
    matches = (S_RE.search(line) for line in lines)
    return [tuple(int(field) for field in m.groups()) for m in matches if m]


def find_overflows(stripes, block_size=stripe_size):
    """Find the stripes that straddle a block boundary.

    ``stripes`` is an iterable of ``(offset, data_length, rows)`` tuples and
    ``block_size`` the size of one block in bytes.

    Returns a list of ``(start, end)`` pairs, where ``end`` is the exclusive
    end offset ``start + data_length`` of an offending stripe.
    """
    overflows = []
    for start, length, _rows in stripes:
        if length <= 0:
            # An empty stripe occupies no bytes and cannot cross a boundary.
            continue
        end = start + length
        # The last byte of the stripe lives at end - 1; comparing against
        # `end` itself would wrongly flag stripes that finish exactly on a
        # block boundary.  `//` keeps integer division on Python 2 and 3.
        if start // block_size != (end - 1) // block_size:
            overflows.append((start, end))
    return overflows


def main():
    """Read an ORC dump from stdin and report stripes that cross blocks."""
    for start, end in find_overflows(parse_stripes(sys.stdin)):
        # Single pre-formatted argument prints identically on Python 2 and 3.
        print("%d overflows %d block" % (end, start))


if __name__ == "__main__":
    main()
@t3rmin4t0r
t3rmin4t0r / Centos-Source.repo
Created July 30, 2014 13:29
Centos SRPMs repository
[base-source]
name=CentOS-$releasever - Base Source
baseurl=http://vault.centos.org/centos/$releasever/os/Source/
enabled=0
[updates-source]
name=CentOS-$releasever - Updates Source
baseurl=http://vault.centos.org/centos/$releasever/updates/Source/
enabled=0
@t3rmin4t0r
t3rmin4t0r / yarn-top-csv.py
Created August 25, 2014 05:55
yarn top csv generator
import sys
import re,math,string
from time import sleep
import os.path as path
import urllib2
from json import loads as json_parse
from collections import defaultdict
import getopt
import datetime
import time
@t3rmin4t0r
t3rmin4t0r / transpose.py
Created October 24, 2014 07:27
transpose.py
import sys,re,math,os
import curses
from time import sleep
# Debug output goes to a file named "log" in the current directory.
log = open("log", "w")


def debug(a):
    """Write ``str(a)`` plus a newline to the log file and flush it at once.

    Returns the pair ``(write_result, flush_result)``.
    """
    written = log.write(str(a) + "\n")
    flushed = log.flush()
    return (written, flushed)


# Captures the application id from a "Status: Running (application id: ...)" line.
running = re.compile(
    r'Status: Running \(application id: (application_[0-9_]*)\)'
)

# Captures, for a "Map N" / "Reducer N" entry: the label, the value before the
# slash, an optional parenthesised part, and the value after the slash.
tasks = re.compile(
    r'(Map [0-9]*|Reducer [0-9]*): ([\-0-9]*)(\(.[0-9]*\))?/([\-0-9]*)'
)
@t3rmin4t0r
t3rmin4t0r / ddl.py
Created November 7, 2014 00:50
Hive Serialization.ddl to Create Table helper
import sys
def create_table_statement(line):
    """Turn one ``name { type col, type col }`` line into a CREATE TABLE.

    The text before the first ``{`` is used verbatim as the table name, and
    each comma-separated ``type column`` pair between the braces is emitted
    as ``column type``.

    Returns the statement as a string, or ``None`` when the line has no
    ``{...}`` section or no columns (e.g. a blank line).

    Raises ``ValueError`` when a column entry is not exactly two
    whitespace-separated words.
    """
    line = line.strip()
    open_brace = line.find("{")
    close_brace = line.find("}", open_brace + 1)
    if open_brace < 0 or close_brace < 0:
        # str.find returns -1 when a brace is missing; slicing with it would
        # silently produce garbage, so such lines are skipped.
        return None

    name = line[:open_brace]
    columns = []
    for entry in line[open_brace + 1:close_brace].split(","):
        # split() without arguments tolerates repeated spaces and tabs.
        words = entry.split()
        if not words:
            continue
        if len(words) != 2:
            raise ValueError("expected '<type> <column>', got %r" % entry.strip())
        col_type, col_name = words
        columns.append("%s %s" % (col_name, col_type))

    if not columns:
        return None
    # Spacing mirrors what the comma-separated print statements produced.
    return "create table  %s ( %s );" % (name, ",".join(columns))


def main():
    """Convert every line read from stdin and print the resulting DDL."""
    for raw in sys.stdin:
        statement = create_table_statement(raw)
        if statement is not None:
            print(statement)


if __name__ == "__main__":
    main()
[gopal@cn041-10 comcast]$
@t3rmin4t0r
t3rmin4t0r / reducer-counters.py
Last active August 29, 2015 14:16
AM history parser for reducer skew checks
import sys
import re
# Build a matcher for the counter called `name`: text is searched for
# "<name>=<value>", where the value is everything up to the next comma.
def Counter(name):
# NOTE: `name` is interpolated into the pattern unescaped, so any regex
# metacharacters it contains are interpreted as pattern syntax.
pattern = re.compile("%s=([^,]*)" % name)
# warning closure: `get` below captures the compiled `pattern`.
def get(l):
# Search line `l` and hand back the captured value when the counter is present.
m = pattern.search(l)
if m:
return m.group(1)
@t3rmin4t0r
t3rmin4t0r / jstacker.py
Created June 25, 2015 19:43
JStack Trie Maker
import sys, re, os, math, os.path
from collections import defaultdict
# Trie node; child nodes are themselves JStackTrie instances.
class JStackTrie(object):
# Set up an empty node.
def __init__(self):
# Children by key; defaultdict builds a fresh JStackTrie the first time a
# missing key is looked up.
self.roots = defaultdict(JStackTrie)
# Counter for this node, starting at zero.
self.count = 0
# Label of this node; every newly created node starts out as "root".
self.name = "root"
@t3rmin4t0r
t3rmin4t0r / ats-extract-plan.py
Created June 30, 2015 19:12
ATS to Hive query plan extraction
import json
import sys
class ATSFile(object):
    """A JSON document loaded from disk.

    Attributes:
        data: the parsed JSON content of the file.
        name: the path the content was read from.
    """

    def __init__(self, name):
        """Parse the JSON file at path ``name``.

        Raises whatever ``open`` or ``json.load`` raise for a missing or
        malformed file.
        """
        # The context manager closes the handle as soon as parsing is done,
        # even when json.load raises, instead of leaving it to the GC.
        with open(name) as source:
            self.data = json.load(source)
        self.name = name

    def dump(self):
        """Decode the query stored under ``data["otherinfo"]["QUERY"]``."""
        info = self.data["otherinfo"]
        # The QUERY entry is itself a JSON-encoded string, hence the second
        # decoding step.
        q = json.loads(info["QUERY"])
@t3rmin4t0r
t3rmin4t0r / namenode-appminer.py
Created July 27, 2015 23:52
Extract list of files created per-application for the HDFS NN (from logs)
import re
import sys, math, os.path
from glob import glob
from itertools import groupby,chain
from collections import defaultdict
import re
def parse(f):
# Matches log entries of the form
#   "DIR* completeFile: <token> is closed by <token>"
# and captures the two space-delimited tokens (presumably the completed file's
# path and the client that closed it -- confirm against real log lines).
PAT = re.compile(r'DIR\* completeFile: ([^ ]*) is closed by ([^ ]*)')
@t3rmin4t0r
t3rmin4t0r / splitgz.awk
Created July 31, 2015 18:58
To Split and Gzip at the same time
#!/usr/bin/env gawk -f
# Set up the initial state and announce the first output file before any
# input record is read.
BEGIN {
    # Presumably the number of rows per output file -- confirm against the
    # main rule.
    count = 1000000

    # Compression command and the matching file-name suffix.
    cmd = "gzip -c -2"
    ext = ".gz"

    # Output files are named by a zero-padded, four-digit sequence number.
    id = 0
    file = sprintf("%04d%s", id, ext)

    print "Opening new file " file " at " NR " rows"
}
# Use pipes