
@yufan-liu
yufan-liu / beeline
Last active August 29, 2015 14:00
beeline connect to sharkserver2
$ sudo bin/beeline
beeline> !connect jdbc:hive2://localhost:10000/default
@yufan-liu
yufan-liu / gist:09f8c71073c26540d036
Created May 14, 2014 05:17
maven resources config
<build>
  <directory>${pom.basedir}/target</directory>
  <outputDirectory>${pom.build.directory}/classes</outputDirectory>
  <finalName>${pom.artifactId}-${pom.version}</finalName>
  <testOutputDirectory>${pom.build.directory}/test-classes</testOutputDirectory>
  <sourceDirectory>${pom.basedir}/src/main/java</sourceDirectory>
  <scriptSourceDirectory>src/main/scripts</scriptSourceDirectory>
  <testSourceDirectory>${pom.basedir}/src/test/java</testSourceDirectory>
  <resources>
    <resource>
<dependency>
  <!-- Child module: a depended-on child module must first be installed via "mvn install", otherwise Maven still cannot find it! -->
  <groupId>bgonline</groupId>
  <artifactId>foundation</artifactId>
  <version>1.0.0</version>
  <!-- Exclusions -->
  <exclusions>
    <exclusion>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
### set log levels ###
log4j.rootLogger = info , stdout
#log4j.rootLogger = info , stdout , D
### output to console ###
log4j.appender.stdout = org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target = System.out
log4j.appender.stdout.layout = org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern = %d{yyyy-MM-dd HH:mm:ss} [%p][%t] %c - %m%n
@yufan-liu
yufan-liu / gist:8a34608404e25f203523
Created May 22, 2014 06:29
Maven dependencies to lib folder
<build>
  <resources>
    <resource>
      <directory>${pom.basedir}/src/main/resources</directory>
    </resource>
  </resources>
  <testResources>
    <testResource>
      <directory>${pom.basedir}/src/test/resources</directory>
    </testResource>
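The gist title mentions copying dependencies to a lib folder, but the snippet above is cut off before that part. The standard mechanism is the maven-dependency-plugin's `copy-dependencies` goal; a sketch, where the `lib` output directory is a typical choice rather than anything mandated:

```xml
<plugin>
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-dependency-plugin</artifactId>
  <executions>
    <execution>
      <id>copy-dependencies</id>
      <phase>package</phase>
      <goals>
        <goal>copy-dependencies</goal>
      </goals>
      <configuration>
        <outputDirectory>${project.build.directory}/lib</outputDirectory>
      </configuration>
    </execution>
  </executions>
</plugin>
```

With this in the `<plugins>` section, `mvn package` leaves all runtime dependencies under `target/lib`.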
@yufan-liu
yufan-liu / central event dispatcher
Created June 12, 2014 03:27
Spark event dispatcher.
// version spark 1.0.0
// file org.apache.spark.scheduler.LiveListenerBus
private val listenerThread = new Thread("SparkListenerBus") {
  setDaemon(true)
  override def run(): Unit = Utils.logUncaughtExceptions {
    while (true) {
      eventLock.acquire()
      // Atomically remove and process this event
      LiveListenerBus.this.synchronized {
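The pattern above (a daemon thread draining a shared event queue, with producers signalling via a lock) can be sketched in Python. The class and method names here are illustrative, not Spark's API:

```python
import queue
import threading

class ListenerBus:
    """Minimal sketch of an async event bus: producers post events,
    a single daemon thread drains and dispatches them in order."""

    def __init__(self):
        self._events = queue.Queue()
        self._listeners = []
        self._thread = threading.Thread(target=self._run, daemon=True)
        self._thread.start()

    def add_listener(self, fn):
        self._listeners.append(fn)

    def post(self, event):
        self._events.put(event)

    def _run(self):
        while True:
            # blocks until an event arrives, like eventLock.acquire()
            event = self._events.get()
            for fn in self._listeners:
                fn(event)
            self._events.task_done()

    def wait_until_empty(self):
        self._events.join()
```

`queue.Queue` plays the role of both the event buffer and the semaphore; Spark's version instead pairs an explicit `Semaphore` with a synchronized queue.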
@yufan-liu
yufan-liu / replay event dispatcher
Created June 12, 2014 03:31
[spark] Replay events dispatcher
// version spark 1.0.0
// file org.apache.spark.scheduler.ReplayListenerBus
/**
* Replay each event in the order maintained in the given logs.
* This should only be called exactly once.
*/
def replay() {
  assert(!replayed, "ReplayListenerBus cannot replay events more than once")
  logPaths.foreach { path =>
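The replay counterpart walks previously logged events in order and dispatches each exactly once. A minimal Python sketch, where the JSON-lines log format and the names are assumptions for illustration, not Spark's actual format:

```python
import json

class ReplayBus:
    """Replay logged events to listeners, exactly once (illustrative sketch)."""

    def __init__(self, listeners):
        self.listeners = listeners
        self.replayed = False

    def replay(self, log_lines):
        # guard mirrors the assert in ReplayListenerBus.replay()
        assert not self.replayed, "cannot replay events more than once"
        self.replayed = True
        for line in log_lines:
            event = json.loads(line)
            for fn in self.listeners:
                fn(event)
```

Unlike the live bus, there is no thread or queue: replay is synchronous, so ordering follows directly from the order of the log lines.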
# src/main/scala/shark/parse/SharkSemanticAnalyzer.scala
@@ -20,9 +20,7 @@ package shark.parse
import java.lang.reflect.Method
import java.util.ArrayList
import java.util.{List => JavaList}
-
import scala.collection.JavaConversions._
-
import org.apache.hadoop.fs.Path
// Convert a byte count to a human-readable string, e.g. 1536 -> "1.5 kB".
function humanFileSize(size) {
  if (size <= 0) return '0 B'; // Math.log(0) is -Infinity, so guard first
  var i = Math.floor(Math.log(size) / Math.log(1024));
  // toFixed(2) * 1 coerces back to a number, stripping trailing zeros
  return (size / Math.pow(1024, i)).toFixed(2) * 1 + ' ' + ['B', 'kB', 'MB', 'GB', 'TB'][i];
}
@yufan-liu
yufan-liu / separator.py
Created September 20, 2018 13:39 — forked from jlln/separator.py
Efficiently split pandas DataFrame cells containing lists into multiple rows, duplicating the other columns' values.
import pandas as pd

def splitDataFrameList(df, target_column, separator):
    ''' df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row.
    The values in the other columns are duplicated across the newly divided rows.
    '''
    def splitListToRows(row, row_accumulator, target_column, separator):
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows, axis=1, args=(new_rows, target_column, separator))
    return pd.DataFrame(new_rows)
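In modern pandas (0.25+), the same effect is available without a helper function by combining `str.split` with `DataFrame.explode`, which duplicates the other columns across the new rows. The sample column names here are made up for illustration:

```python
import pandas as pd

# "tags" holds separator-joined values; split into lists, then explode
df = pd.DataFrame({"name": ["a", "b"], "tags": ["x,y", "z"]})
out = (
    df.assign(tags=df["tags"].str.split(","))
      .explode("tags")
      .reset_index(drop=True)
)
```

`explode` preserves row order and repeats `name` for each element, matching what the gist's apply-based version produces.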