# r-wheeler/query_runner.py

## query_runner.py
import subprocess
import pandas as pd
import itertools


def query_driver(job, fname = False):
    """
    Run a Hive job on the gateway machine over ssh and return its output.

    The job is executed as ``ssh hadoop hive -e <query>`` (or ``-f <script>``).
    You will still need to log in and enter your kerberos credentials in
    another ssh window prior to running.

    Args:
    job (string):    Either a raw query string to pass to hive, or the path
                     to a hive script to be submitted.

    fname (boolean): if job = path/to/script.hive, set fname to True

    Returns:
    ----------
    DataFrame of the query results. The first line of output is used as the
    column names (so enable ``hive.cli.print.header`` in the query, otherwise
    the first data row becomes the header); every value is a string.
    An empty DataFrame is returned when the command prints nothing.

    Notes:
    ----------
    * stderr of the ssh/hive command is written to a file named 'logfile' in
      the current working directory, overwritten on every call.
    * The exit status of the command is not checked; inspect 'logfile' when
      the result is unexpectedly empty.
    * Be careful with how much data your query will pull back as all results
      are read into memory.
    """
    # shlex.quote on Python 3.3+, pipes.quote on Python 2.
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    hive_param = "-f" if fname else "-e"

    with open('logfile', 'w') as logfile:
        # ssh joins its trailing arguments with spaces and hands the result to
        # the remote shell, so the job must be quoted for that shell. repr()
        # is not a shell quoter: it breaks on quotes, backticks, '$', ...
        proc = subprocess.Popen(
            ["ssh", "hadoop", "hive", hive_param, quote(job)],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=logfile,
            universal_newlines=True,  # text lines on both Python 2 and 3
        )
        # Nothing is ever sent to the remote command; signal EOF right away so
        # it cannot block waiting for input.
        proc.stdin.close()
        try:
            # Drop the line terminator so it does not end up glued to the last
            # field (and to the last column name).
            data = [line.rstrip('\r\n').split('\t') for line in proc.stdout]
        finally:
            proc.stdout.close()
            proc.wait()  # reap the child before the log file is closed

    cols = data[0] if data else []
    return pd.DataFrame(data[1:], columns=cols)

# Examples. Guarded so that they only run when this file is executed as a
# script; importing query_driver from another module must not launch Hive jobs.
if __name__ == "__main__":
    # Running a script from a file:
    t_frame = query_driver('/home/nmrw48/test.hive', fname=True)

    # Running a query passed in as a param:
    q = 'set hive.cli.print.header=true;describe table1;'
    q_frame = query_driver(q)
	import subprocess
	import pandas as pd
	import itertools


	def query_driver(job, fname = False):
	    """
	    Run a Hive job on the gateway machine over ssh and return its output.

	    The job is executed as ``ssh hadoop hive -e <query>`` (or ``-f <script>``).
	    You will still need to log in and enter your kerberos credentials in
	    another ssh window prior to running.

	    Args:
	    job (string):    Either a raw query string to pass to hive, or the path
	                     to a hive script to be submitted.

	    fname (boolean): if job = path/to/script.hive, set fname to True

	    Returns:
	    ----------
	    DataFrame of the query results. The first line of output is used as the
	    column names (so enable ``hive.cli.print.header`` in the query, otherwise
	    the first data row becomes the header); every value is a string.
	    An empty DataFrame is returned when the command prints nothing.

	    Notes:
	    ----------
	    * stderr of the ssh/hive command is written to a file named 'logfile' in
	      the current working directory, overwritten on every call.
	    * The exit status of the command is not checked; inspect 'logfile' when
	      the result is unexpectedly empty.
	    * Be careful with how much data your query will pull back as all results
	      are read into memory.
	    """
	    # shlex.quote on Python 3.3+, pipes.quote on Python 2.
	    try:
	        from shlex import quote
	    except ImportError:
	        from pipes import quote

	    hive_param = "-f" if fname else "-e"

	    with open('logfile', 'w') as logfile:
	        # ssh joins its trailing arguments with spaces and hands the result to
	        # the remote shell, so the job must be quoted for that shell. repr()
	        # is not a shell quoter: it breaks on quotes, backticks, '$', ...
	        proc = subprocess.Popen(
	            ["ssh", "hadoop", "hive", hive_param, quote(job)],
	            stdin=subprocess.PIPE,
	            stdout=subprocess.PIPE,
	            stderr=logfile,
	            universal_newlines=True,  # text lines on both Python 2 and 3
	        )
	        # Nothing is ever sent to the remote command; signal EOF right away so
	        # it cannot block waiting for input.
	        proc.stdin.close()
	        try:
	            # Drop the line terminator so it does not end up glued to the last
	            # field (and to the last column name).
	            data = [line.rstrip('\r\n').split('\t') for line in proc.stdout]
	        finally:
	            proc.stdout.close()
	            proc.wait()  # reap the child before the log file is closed

	    cols = data[0] if data else []
	    return pd.DataFrame(data[1:], columns=cols)

	# Examples. Guarded so that they only run when this file is executed as a
	# script; importing query_driver from another module must not launch Hive jobs.
	if __name__ == "__main__":
	    # Running a script from a file:
	    t_frame = query_driver('/home/nmrw48/test.hive', fname=True)

	    # Running a query passed in as a param:
	    q = 'set hive.cli.print.header=true;describe table1;'
	    q_frame = query_driver(q)