@rubensayshi
Created April 26, 2012 13:09
Hive MAPJOIN + LATERAL VIEW
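This gist documents what appears to be a bug in Hive 0.8.1: when a subquery carries a /*+ MAPJOIN(...) */ hint and the outer query applies a LATERAL VIEW explode(...), the job completes successfully but returns no rows (the log below ends with only the column header). Dropping either the hint or the LATERAL VIEW makes the same query return data, as the reproduction script at the bottom shows. The failing shape, taken from that script:

    SELECT
        date_int,
        gender,
        part,
        COUNT( DISTINCT member_id ),
        SUM( partstats['seconds'] )
    FROM (
        SELECT
            /*+ MAPJOIN(members_map) */
            date_int,
            visit_stats.member_id,
            members_map.gender as gender,
            parts
        FROM visit_stats
        JOIN members_map ON(members_map.member_id = visit_stats.member_id)
    ) visit_stats
    LATERAL VIEW explode(parts) partsTable AS part, partstats
    GROUP BY date_int, gender, part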
Logging initialized using configuration in jar:file:/opt/hive-0.8.1-bin/lib/hive-common-0.8.1.jar!/hive-log4j.properties
12/07/04 10:09:16 INFO SessionState: Logging initialized using configuration in jar:file:/opt/hive-0.8.1-bin/lib/hive-common-0.8.1.jar!/hive-log4j.properties
Hive history file=/tmp/hdfs/hive_job_log_hdfs_201207041009_796357994.txt
12/07/04 10:09:16 INFO exec.HiveHistory: Hive history file=/tmp/hdfs/hive_job_log_hdfs_201207041009_796357994.txt
ADD JAR /usr/local/hyves/bin/logging/hadoop/goldmine.jar
12/07/04 10:09:16 INFO SessionState: Added /usr/local/hyves/bin/logging/hadoop/goldmine.jar to class path
12/07/04 10:09:16 INFO SessionState: Added resource: /usr/local/hyves/bin/logging/hadoop/goldmine.jar
ADD JAR /usr/local/hyves/bin/logging/hadoop/json-serde-1.1-jar-with-dependencies.jar
12/07/04 10:09:16 INFO SessionState: Added /usr/local/hyves/bin/logging/hadoop/json-serde-1.1-jar-with-dependencies.jar to class path
12/07/04 10:09:16 INFO SessionState: Added resource: /usr/local/hyves/bin/logging/hadoop/json-serde-1.1-jar-with-dependencies.jar
CREATE TEMPORARY FUNCTION age AS 'org.hyves.hive.udf.Age'
12/07/04 10:09:16 INFO ql.Driver: <PERFLOG method=compile>
12/07/04 10:09:16 INFO parse.ParseDriver: Parsing command:
CREATE TEMPORARY FUNCTION age AS 'org.hyves.hive.udf.Age'
12/07/04 10:09:16 INFO parse.ParseDriver: Parse Completed
12/07/04 10:09:16 INFO parse.FunctionSemanticAnalyzer: analyze done
12/07/04 10:09:16 INFO ql.Driver: Semantic Analysis Completed
12/07/04 10:09:16 INFO ql.Driver: Returning Hive schema: Schema(fieldSchemas:null, properties:null)
12/07/04 10:09:16 INFO ql.Driver: </PERFLOG method=compile start=1341389356707 end=1341389356907 duration=200>
12/07/04 10:09:16 INFO ql.Driver: <PERFLOG method=Driver.execute>
12/07/04 10:09:16 INFO ql.Driver: Starting command:
CREATE TEMPORARY FUNCTION age AS 'org.hyves.hive.udf.Age'
12/07/04 10:09:16 INFO ql.Driver: </PERFLOG method=Driver.execute start=1341389356907 end=1341389356926 duration=19>
12/07/04 10:09:16 INFO ql.Driver: OK
12/07/04 10:09:16 INFO ql.Driver: <PERFLOG method=releaseLocks>
12/07/04 10:09:16 INFO ql.Driver: </PERFLOG method=releaseLocks start=1341389356926 end=1341389356926 duration=0>
12/07/04 10:09:16 INFO CliDriver: Time taken: 0.22 seconds
CREATE TEMPORARY FUNCTION age_group AS 'org.hyves.hive.udf.AgeGroup'
12/07/04 10:09:16 INFO ql.Driver: <PERFLOG method=compile>
12/07/04 10:09:16 INFO parse.ParseDriver: Parsing command:
CREATE TEMPORARY FUNCTION age_group AS 'org.hyves.hive.udf.AgeGroup'
12/07/04 10:09:16 INFO parse.ParseDriver: Parse Completed
12/07/04 10:09:16 INFO parse.FunctionSemanticAnalyzer: analyze done
12/07/04 10:09:16 INFO ql.Driver: Semantic Analysis Completed
12/07/04 10:09:16 INFO ql.Driver: Returning Hive schema: Schema(fieldSchemas:null, properties:null)
12/07/04 10:09:16 INFO ql.Driver: </PERFLOG method=compile start=1341389356927 end=1341389356929 duration=2>
12/07/04 10:09:16 INFO ql.Driver: <PERFLOG method=Driver.execute>
12/07/04 10:09:16 INFO ql.Driver: Starting command:
CREATE TEMPORARY FUNCTION age_group AS 'org.hyves.hive.udf.AgeGroup'
12/07/04 10:09:16 INFO ql.Driver: </PERFLOG method=Driver.execute start=1341389356929 end=1341389356931 duration=2>
12/07/04 10:09:16 INFO ql.Driver: OK
12/07/04 10:09:16 INFO ql.Driver: <PERFLOG method=releaseLocks>
12/07/04 10:09:16 INFO ql.Driver: </PERFLOG method=releaseLocks start=1341389356932 end=1341389356932 duration=0>
12/07/04 10:09:16 INFO CliDriver: Time taken: 0.0050 seconds
USE hyves_goldmine
12/07/04 10:09:16 INFO ql.Driver: <PERFLOG method=compile>
12/07/04 10:09:16 INFO parse.ParseDriver: Parsing command:
USE hyves_goldmine
12/07/04 10:09:16 INFO parse.ParseDriver: Parse Completed
12/07/04 10:09:16 INFO ql.Driver: Semantic Analysis Completed
12/07/04 10:09:16 INFO ql.Driver: Returning Hive schema: Schema(fieldSchemas:null, properties:null)
12/07/04 10:09:16 INFO ql.Driver: </PERFLOG method=compile start=1341389356932 end=1341389356946 duration=14>
12/07/04 10:09:16 INFO ql.Driver: <PERFLOG method=Driver.execute>
12/07/04 10:09:16 INFO ql.Driver: Starting command:
USE hyves_goldmine
12/07/04 10:09:17 INFO metastore.HiveMetaStore: 0: Opening raw store with implemenation class:org.apache.hadoop.hive.metastore.ObjectStore
12/07/04 10:09:17 INFO metastore.ObjectStore: ObjectStore, initialize called
12/07/04 10:09:17 ERROR DataNucleus.Plugin: Bundle "org.eclipse.jdt.core" requires "org.eclipse.core.resources" but it cannot be resolved.
12/07/04 10:09:17 ERROR DataNucleus.Plugin: Bundle "org.eclipse.jdt.core" requires "org.eclipse.core.runtime" but it cannot be resolved.
12/07/04 10:09:17 ERROR DataNucleus.Plugin: Bundle "org.eclipse.jdt.core" requires "org.eclipse.text" but it cannot be resolved.
12/07/04 10:09:18 INFO metastore.ObjectStore: Setting MetaStore object pin classes with hive.metastore.cache.pinobjtypes="Table,StorageDescriptor,SerDeInfo,Partition,Database,Type,FieldSchema,Order"
12/07/04 10:09:18 INFO metastore.ObjectStore: Initialized ObjectStore
12/07/04 10:09:18 INFO metastore.HiveMetaStore: 0: get_database: hyves_goldmine
12/07/04 10:09:18 INFO metastore.HiveMetaStore: 0: get_database: hyves_goldmine
12/07/04 10:09:18 INFO ql.Driver: </PERFLOG method=Driver.execute start=1341389356946 end=1341389358808 duration=1862>
12/07/04 10:09:18 INFO ql.Driver: OK
12/07/04 10:09:18 INFO ql.Driver: <PERFLOG method=releaseLocks>
12/07/04 10:09:18 INFO ql.Driver: </PERFLOG method=releaseLocks start=1341389358808 end=1341389358808 duration=0>
12/07/04 10:09:18 INFO CliDriver: Time taken: 1.876 seconds
set hive.cli.print.header=true
set hive.optimize.cp=false
SELECT
date_int as date_int,
gender as gender,
part as part,
COUNT( DISTINCT memberId ) as uniq,
SUM( partstats['seconds'] ) as sec
FROM (
SELECT
/*+ MAPJOIN(members_map) */
date_int,
visit_stats.memberId,
members_map.gender as gender,
parts
FROM visit_stats
JOIN members_map ON(members_map.member_id = visit_stats.memberId)
) visit_stats
LATERAL VIEW explode(parts) partsTable AS part, partstats
WHERE date_int=20120101
GROUP BY date_int, gender, part
12/07/04 10:09:18 INFO ql.Driver: <PERFLOG method=compile>
12/07/04 10:09:18 INFO parse.ParseDriver: Parsing command:
SELECT
date_int as date_int,
gender as gender,
part as part,
COUNT( DISTINCT memberId ) as uniq,
SUM( partstats['seconds'] ) as sec
FROM (
SELECT
/*+ MAPJOIN(members_map) */
date_int,
visit_stats.memberId,
members_map.gender as gender,
parts
FROM visit_stats
JOIN members_map ON(members_map.member_id = visit_stats.memberId)
) visit_stats
LATERAL VIEW explode(parts) partsTable AS part, partstats
WHERE date_int=20120101
GROUP BY date_int, gender, part
12/07/04 10:09:18 INFO parse.ParseDriver: Parse Completed
12/07/04 10:09:18 INFO parse.SemanticAnalyzer: Starting Semantic Analysis
12/07/04 10:09:18 INFO parse.SemanticAnalyzer: Completed phase 1 of Semantic Analysis
12/07/04 10:09:18 INFO parse.SemanticAnalyzer: Get metadata for source tables
12/07/04 10:09:18 INFO parse.SemanticAnalyzer: Get metadata for subqueries
12/07/04 10:09:18 INFO parse.SemanticAnalyzer: Get metadata for source tables
12/07/04 10:09:18 INFO metastore.HiveMetaStore: 0: get_table : db=hyves_goldmine tbl=visit_stats
12/07/04 10:09:19 INFO hive.log: DDL: struct visit_stats { string date, i32 memberid, map<string,i32> generic, map<string,map<string,i32>> platforms, map<string,map<string,i32>> parts, map<string,map<string,map<string,i32>>> devices_by_platform}
12/07/04 10:09:19 INFO hive.log: DDL: struct visit_stats { string date, i32 memberid, map<string,i32> generic, map<string,map<string,i32>> platforms, map<string,map<string,i32>> parts, map<string,map<string,map<string,i32>>> devices_by_platform}
12/07/04 10:09:19 INFO metastore.HiveMetaStore: 0: get_table : db=hyves_goldmine tbl=members_map
12/07/04 10:09:19 INFO parse.ParseDriver: Parsing command: SELECT `member_id` AS `member_id`, `gender` AS `gender`, `birthdate` AS `birthdate` FROM (SELECT `members_map_full`.`member_id`, `members_map_full`.`gender`, `members_map_full`.`birthdate` FROM `members_map_full` WHERE `members_map_full`.`member_id` % 250 = 0) `members_map`
12/07/04 10:09:19 INFO parse.ParseDriver: Parse Completed
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for subqueries
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for source tables
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for subqueries
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for source tables
12/07/04 10:09:19 INFO metastore.HiveMetaStore: 0: get_table : db=hyves_goldmine tbl=members_map_full
12/07/04 10:09:19 INFO hive.log: DDL: struct members_map_full { i32 member_id, i32 gender, string birthdate}
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for subqueries
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for destination tables
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for destination tables
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for destination tables
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for destination tables
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Completed getting MetaData in Semantic Analysis
12/07/04 10:09:19 INFO hive.log: DDL: struct members_map_full { i32 member_id, i32 gender, string birthdate}
12/07/04 10:09:19 WARN parse.SemanticAnalyzer: Common Gby keys:null
12/07/04 10:09:19 WARN parse.SemanticAnalyzer: Common Gby keys:null
12/07/04 10:09:19 WARN parse.SemanticAnalyzer: Common Gby keys:null
12/07/04 10:09:19 WARN parse.SemanticAnalyzer: Common Gby keys:null
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for FS(20)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for SEL(19)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for GBY(18)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for RS(17)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for GBY(16)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for SEL(15)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for FIL(14)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of FIL For Alias : visit_stats
12/07/04 10:09:19 INFO ppd.OpProcFactory: (_col0 = 20120101)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for LVJ(13)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of LVJ For Alias : visit_stats
12/07/04 10:09:19 INFO ppd.OpProcFactory: (_col0 = 20120101)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for SEL(10)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of SEL For Alias : visit_stats
12/07/04 10:09:19 INFO ppd.OpProcFactory: (_col0 = 20120101)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for UDTF(12)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for SEL(11)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for LVF(9)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for SEL(8)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of SEL For Alias : visit_stats
12/07/04 10:09:19 INFO ppd.OpProcFactory: (_col6 = 20120101)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for JOIN(7)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of JOIN For Alias : visit_stats
12/07/04 10:09:19 INFO ppd.OpProcFactory: (VALUE._col6 = 20120101)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for RS(5)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of RS For Alias : visit_stats
12/07/04 10:09:19 INFO ppd.OpProcFactory: (date_int = 20120101)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for TS(4)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of TS For Alias : visit_stats
12/07/04 10:09:19 INFO ppd.OpProcFactory: (date_int = 20120101)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for RS(6)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for SEL(3)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for SEL(2)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for FIL(1)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of FIL For Alias : members_map_full
12/07/04 10:09:19 INFO ppd.OpProcFactory: ((member_id % 250) = 0)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for TS(0)
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of TS For Alias : members_map_full
12/07/04 10:09:19 INFO ppd.OpProcFactory: ((member_id % 250) = 0)
12/07/04 10:09:19 INFO ppr.PartitionPruner: <PERFLOG method=prune-listing>
12/07/04 10:09:19 INFO metastore.HiveMetaStore: 0: get_partition_names : db=hyves_goldmine tbl=visit_stats
12/07/04 10:09:19 INFO ppr.PartitionPruner: </PERFLOG method=prune-listing start=1341389359513 end=1341389359726 duration=213>
12/07/04 10:09:19 INFO ppr.PartitionPruner: <PERFLOG method=partition-retrieving>
12/07/04 10:09:19 INFO metastore.HiveMetaStore: 0: get_partitions_by_names : db=hyves_goldmine tbl=visit_stats
12/07/04 10:09:19 INFO ppr.PartitionPruner: </PERFLOG method=partition-retrieving start=1341389359726 end=1341389359755 duration=29>
12/07/04 10:09:19 INFO hive.log: DDL: struct visit_stats { string date, i32 memberid, map<string,i32> generic, map<string,map<string,i32>> platforms, map<string,map<string,i32>> parts, map<string,map<string,map<string,i32>>> devices_by_platform}
12/07/04 10:09:19 INFO hive.log: DDL: struct visit_stats { string date, i32 memberid, map<string,i32> generic, map<string,map<string,i32>> platforms, map<string,map<string,i32>> parts, map<string,map<string,map<string,i32>>> devices_by_platform}
12/07/04 10:09:19 INFO hive.log: DDL: struct visit_stats { string date, i32 memberid, map<string,i32> generic, map<string,map<string,i32>> platforms, map<string,map<string,i32>> parts, map<string,map<string,map<string,i32>>> devices_by_platform}
12/07/04 10:09:19 INFO hive.log: DDL: struct members_map_full { i32 member_id, i32 gender, string birthdate}
12/07/04 10:09:19 INFO hive.log: DDL: struct members_map_full { i32 member_id, i32 gender, string birthdate}
12/07/04 10:09:19 INFO hive.log: DDL: struct members_map_full { i32 member_id, i32 gender, string birthdate}
12/07/04 10:09:19 INFO physical.MetadataOnlyOptimizer: Looking for table scans where optimization is applicable
12/07/04 10:09:19 INFO physical.MetadataOnlyOptimizer: Found 0 metadata only table scans
12/07/04 10:09:19 INFO physical.MetadataOnlyOptimizer: Looking for table scans where optimization is applicable
12/07/04 10:09:19 INFO physical.MetadataOnlyOptimizer: Found 0 metadata only table scans
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Completed plan generation
12/07/04 10:09:19 INFO ql.Driver: Semantic Analysis Completed
12/07/04 10:09:19 INFO ql.Driver: Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:date_int, type:string, comment:null), FieldSchema(name:gender, type:int, comment:null), FieldSchema(name:part, type:string, comment:null), FieldSchema(name:uniq, type:bigint, comment:null), FieldSchema(name:sec, type:bigint, comment:null)], properties:null)
12/07/04 10:09:19 INFO ql.Driver: </PERFLOG method=compile start=1341389358811 end=1341389359843 duration=1032>
12/07/04 10:09:19 INFO ql.Driver: <PERFLOG method=Driver.execute>
12/07/04 10:09:19 INFO ql.Driver: Starting command:
SELECT
date_int as date_int,
gender as gender,
part as part,
COUNT( DISTINCT memberId ) as uniq,
SUM( partstats['seconds'] ) as sec
FROM (
SELECT
/*+ MAPJOIN(members_map) */
date_int,
visit_stats.memberId,
members_map.gender as gender,
parts
FROM visit_stats
JOIN members_map ON(members_map.member_id = visit_stats.memberId)
) visit_stats
LATERAL VIEW explode(parts) partsTable AS part, partstats
WHERE date_int=20120101
GROUP BY date_int, gender, part
Total MapReduce jobs = 1
12/07/04 10:09:19 INFO ql.Driver: Total MapReduce jobs = 1
12/07/04 10:09:19 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
12/07/04 10:09:19 INFO exec.MapredLocalTask: Generating plan file file:/tmp/hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-local-10005/plan.xml
12/07/04 10:09:20 INFO exec.MapredLocalTask: Executing: /usr/lib/hadoop-0.20/bin/hadoop jar /opt/hive-0.8.1-bin/lib/hive-exec-0.8.1.jar org.apache.hadoop.hive.ql.exec.ExecDriver -localtask -plan file:/tmp/hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-local-10005/plan.xml -jobconf hive.stats.jdbcdriver=org.apache.derby.jdbc.EmbeddedDriver -jobconf hive.mapjoin.followby.gby.localtask.max.memory.usage=0.55 -jobconf hive.limit.optimize.limit.file=10 -jobconf hive.metastore.archive.intermediate.archived=_INTERMEDIATE_ARCHIVED -jobconf datanucleus.validateColumns=false -jobconf hive.mapred.partitioner=org.apache.hadoop.hive.ql.io.DefaultHivePartitioner -jobconf hive.cli.errors.ignore=false -jobconf hive.downloaded.resources.dir=%2Ftmp%2Fhdfs%2Fhive_resources -jobconf hive.lockmgr.zookeeper.default.partition.name=__HIVE_DEFAULT_ZOOKEEPER_PARTITION__ -jobconf hive.zookeeper.session.timeout=600000 -jobconf datanucleus.connectionPoolingType=DBCP -jobconf hive.variable.substitute=true -jobconf dfs.namenode.decommission.nodes.per.interval=5 -jobconf dfs.https.need.client.auth=false -jobconf hive.stats.dbclass=jdbc%3Aderby -jobconf hive.ppd.remove.duplicatefilters=true -jobconf hive.lock.mapred.only.operation=false -jobconf dfs.datanode.data.dir.perm=700 -jobconf hive.script.serde=org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -jobconf dfs.datanode.address=0.0.0.0%3A50010 -jobconf dfs.namenode.logging.level=info -jobconf dfs.block.access.token.enable=false -jobconf hive.merge.smallfiles.avgsize=16000000 -jobconf datanucleus.cache.level2=false -jobconf hive.hbase.wal.enabled=true -jobconf dfs.safemode.threshold.pct=0.999f -jobconf dfs.blockreport.initialDelay=0 -jobconf dfs.namenode.handler.count=10 -jobconf hive.security.authorization.manager=org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider -jobconf hive.merge.current.job.has.dynamic.partitions=false -jobconf hive.metastore.rawstore.impl=org.apache.hadoop.hive.metastore.ObjectStore -jobconf hive.metastore.local=true -jobconf hive.optimize.bucketmapjoin=false -jobconf hive.optimize.ppd.storage=true -jobconf hive.querylog.location=%2Ftmp%2Fhdfs -jobconf hive.limit.row.max.size=100000 -jobconf dfs.block.access.key.update.interval=600 -jobconf hive.rework.mapredwork=false -jobconf dfs.safemode.extension=30000 -jobconf hive.index.compact.query.max.entries=10000000 -jobconf hive.metastore.authorization.storage.checks=false -jobconf hive.autogen.columnalias.prefix.includefuncname=false -jobconf hive.zookeeper.namespace=hive_zookeeper_namespace -jobconf hive.test.mode.prefix=test_ -jobconf hive.merge.rcfile.block.level=true -jobconf hive.test.mode=false -jobconf hive.exec.compress.intermediate=false -jobconf datanucleus.cache.level2.type=SOFT -jobconf dfs.https.server.keystore.resource=ssl-server.xml -jobconf hive.metastore.ds.retry.attempts=1 -jobconf hive.limit.optimize.enable=false -jobconf hive.zookeeper.client.port=2181 -jobconf dfs.block.access.token.lifetime=600 -jobconf hive.exec.perf.logger=org.apache.hadoop.hive.ql.log.PerfLogger -jobconf javax.jdo.option.ConnectionUserName=hadoop -jobconf dfs.name.edits.dir=%24%7Bdfs.name.dir%7D -jobconf hive.merge.mapfiles=true -jobconf hive.test.mode.samplefreq=32 -jobconf hive.optimize.skewjoin=false -jobconf hive.optimize.index.groupby=false -jobconf hive.metastore.server.min.threads=200 -jobconf hive.mapjoin.localtask.max.memory.usage=0.9 -jobconf dfs.block.size=67108864 -jobconf hive.map.aggr.hash.min.reduction=0.5 -jobconf hive.exec.compress.output=false -jobconf 
dfs.datanode.ipc.address=0.0.0.0%3A50020 -jobconf javax.jdo.option.Multithreaded=true -jobconf hive.script.recordreader=org.apache.hadoop.hive.ql.exec.TextRecordReader -jobconf dfs.permissions=true -jobconf hive.multigroupby.singlemr=false -jobconf hive.lock.numretries=100 -jobconf hive.optimize.metadataonly=true -jobconf hive.exec.parallel.thread.number=8 -jobconf hive.exec.default.partition.name=__HIVE_DEFAULT_PARTITION__ -jobconf hive.exec.max.created.files=100000 -jobconf hive.archive.har.parentdir.settable=false -jobconf hive.metastore.event.clean.freq=0 -jobconf dfs.datanode.https.address=0.0.0.0%3A50475 -jobconf hive.exec.mode.local.auto=false -jobconf dfs.secondary.http.address=0.0.0.0%3A50090 -jobconf hive.optimize.index.filter=false -jobconf datanucleus.storeManagerType=rdbms -jobconf dfs.replication.max=512 -jobconf hive.script.operator.id.env.var=HIVE_SCRIPT_OPERATOR_ID -jobconf hive.exec.mode.local.auto.inputbytes.max=134217728 -jobconf mapred.min.split.size=1 -jobconf dfs.namenode.delegation.token.renew-interval=86400000 -jobconf hive.mapjoin.size.key=10000 -jobconf hive.metastore.ds.retry.interval=1000 -jobconf hive.skewjoin.mapjoin.min.split=33554432 -jobconf hive.metastore.client.connect.retry.delay=1 -jobconf hive.auto.convert.join=false -jobconf dfs.https.client.keystore.resource=ssl-client.xml -jobconf hive.metastore.warehouse.dir=%2Fuser%2Fhive%2Fwarehouse -jobconf hive.mapjoin.bucket.cache.size=100 -jobconf hive.exec.job.debug.timeout=30000 -jobconf datanucleus.transactionIsolation=read-committed -jobconf hive.stats.jdbc.timeout=30 -jobconf hive.mergejob.maponly=true -jobconf dfs.https.address=0.0.0.0%3A50470 -jobconf dfs.balance.bandwidthPerSec=1048576 -jobconf hive.fetch.output.serde=org.apache.hadoop.hive.serde2.DelimitedJSONSerDe -jobconf hive.root.logger=INFO%2Cconsole -jobconf hive.exec.script.trust=false -jobconf hive.mapjoin.followby.map.aggr.hash.percentmemory=0.3 -jobconf hive.exim.uri.scheme.whitelist=hdfs%2Cpfile -jobconf hive.stats.dbconnectionstring=jdbc%3Aderby%3A%3BdatabaseName%3DTempStatsStore%3Bcreate%3Dtrue -jobconf mapred.reduce.tasks=-1 -jobconf hive.optimize.index.filter.compact.minsize=5368709120 -jobconf hive.skewjoin.key=100000 -jobconf javax.jdo.option.ConnectionDriverName=com.mysql.jdbc.Driver -jobconf hive.security.authenticator.manager=org.apache.hadoop.hive.ql.security.HadoopDefaultAuthenticator -jobconf dfs.max.objects=0 -jobconf mapred.input.dir.recursive=false -jobconf hive.udtf.auto.progress=false -jobconf hive.session.id=hdfs_201207041009 -jobconf mapred.job.name= -jobconf dfs.datanode.dns.nameserver=default -jobconf hive.exec.script.maxerrsize=100000 -jobconf dfs.blockreport.intervalMsec=3600000 -jobconf hive.optimize.groupby=true -jobconf datanucleus.plugin.pluginRegistryBundleCheck=LOG -jobconf hive.exec.rowoffset=false -jobconf hive.default.fileformat=TextFile -jobconf hive.hadoop.supports.splittable.combineinputformat=false -jobconf hive.metastore.archive.intermediate.original=_INTERMEDIATE_ORIGINAL -jobconf hive.mapjoin.smalltable.filesize=25000000 -jobconf hive.exec.scratchdir=%2Ftmp%2Fhive-hdfs -jobconf datanucleus.identifierFactory=datanucleus -jobconf hive.exec.max.dynamic.partitions.pernode=100 -jobconf hive.stats.retries.max=0 -jobconf dfs.client.block.write.retries=3 -jobconf hive.join.emit.interval=1000 -jobconf hive.script.recordwriter=org.apache.hadoop.hive.ql.exec.TextRecordWriter -jobconf datanucleus.validateConstraints=false -jobconf hive.exec.dynamic.partition=false -jobconf hive.hashtable.loadfactor=0.75 -jobconf 
dfs.https.enable=false -jobconf hive.sample.seednumber=0 -jobconf dfs.datanode.failed.volumes.tolerated=0 -jobconf hive.optimize.index.filter.compact.maxsize=-1 -jobconf hive.metastore.client.socket.timeout=20 -jobconf hive.map.aggr.hash.force.flush.memory.threshold=0.9 -jobconf dfs.namenode.delegation.key.update-interval=86400000 -jobconf hive.exec.show.job.failure.debug.info=true -jobconf hive.join.cache.size=25000 -jobconf hive.mapper.cannot.span.multiple.partitions=false -jobconf dfs.default.chunk.view.size=32768 -jobconf datanucleus.autoStartMechanismMode=checked -jobconf mapred.min.split.size.per.node=1 -jobconf hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat -jobconf hive.skewjoin.mapjoin.map.tasks=10000 -jobconf hive.index.compact.query.max.size=10737418240 -jobconf hive.exec.drop.ignorenonexistent=true -jobconf hive.hwi.listen.port=9999 -jobconf hive.session.silent=false -jobconf hive.cli.print.header=true -jobconf dfs.datanode.du.reserved=0 -jobconf hadoop.bin.path=%2Fusr%2Flib%2Fhadoop-0.20%2Fbin%2Fhadoop -jobconf dfs.web.ugi=webuser%2Cwebgroup -jobconf dfs.df.interval=60000 -jobconf hive.groupby.mapaggr.checkinterval=100000 -jobconf mapred.min.split.size.per.rack=1 -jobconf dfs.data.dir=%24%7Bdfs.name.dir%7D%2Fdata -jobconf hive.metastore.server.max.threads=100000 -jobconf dfs.datanode.dns.interface=default -jobconf hive.metastore.kerberos.principal=hive-metastore%2F_HOST%40EXAMPLE.COM -jobconf javax.jdo.option.ConnectionURL=jdbc%3Amysql%3A%2F%2Flocalhost%3A3306%2Fhive%3FcreateDatabaseIfNotExist%3Dtrue -jobconf hive.mapred.mode=nonstrict -jobconf hive.metastore.batch.retrieve.max=300 -jobconf hive.outerjoin.supports.filters=true -jobconf hive.groupby.skewindata=false -jobconf dfs.support.append=true -jobconf hive.exec.mode.local.auto.tasks.max=4 -jobconf hive.exec.parallel=false -jobconf hive.debug.localtask=false -jobconf dfs.permissions.supergroup=supergroup -jobconf hive.zookeeper.clean.extra.nodes=false -jobconf dfs.replication.min=1 -jobconf hive.merge.size.per.task=256000000 -jobconf fs.har.impl=org.apache.hadoop.hive.shims.HiveHarFileSystem -jobconf hive.unlock.numretries=10 -jobconf hive.exec.reducers.max=999 -jobconf hive.cli.print.current.db=false -jobconf hive.exec.tasklog.debug.timeout=20000 -jobconf dfs.namenode.decommission.interval=30 -jobconf hive.metastore.archive.intermediate.extracted=_INTERMEDIATE_EXTRACTED -jobconf hive.metastore.execute.setugi=false -jobconf dfs.http.address=hadoop-nn.internal%3A50070 -jobconf hive.query.string=%0ASELECT%0A++++date_int+as+date_int%2C%0A++++gender+as+gender%2C%0A++++part+as+part%2C%0A++++COUNT%28+DISTINCT+memberId+%29+as+uniq%2C%0A++++SUM%28+partstats%5B%27seconds%27%5D+%29+as+sec%0AFROM+%28%0A++++SELECT%0A++++++++%2F*%2B+MAPJOIN%28members_map%29+*%2F%0A++++++++date_int%2C%0A++++++++visit_stats.memberId%2C%0A++++++++members_map.gender+as+gender%2C%0A++++++++parts%0A++++FROM+visit_stats%0A++++JOIN+members_map+ON%28members_map.member_id+%3D+visit_stats.memberId%29%0A%29+visit_stats%0ALATERAL+VIEW+explode%28parts%29+partsTable+AS+part%2C+partstats%0AWHERE+date_int%3D20120101%0AGROUP+BY+date_int%2C+gender%2C+part -jobconf dfs.heartbeat.interval=3 -jobconf hive.stats.collect.rawdatasize=true -jobconf hive.optimize.ppd=true -jobconf dfs.name.dir=%2Fvar%2Flib%2Fhadoop -jobconf hive.stats.retries.wait=3000 -jobconf hive.metastore.cache.pinobjtypes=Table%2CStorageDescriptor%2CSerDeInfo%2CPartition%2CDatabase%2CType%2CFieldSchema%2COrder -jobconf 
hive.metastore.fs.handler.class=org.apache.hadoop.hive.metastore.HiveMetaStoreFsImpl -jobconf dfs.datanode.http.address=0.0.0.0%3A50075 -jobconf hive.mapjoin.cache.numrows=25000 -jobconf hive.merge.mapredfiles=false -jobconf hive.metastore.sasl.enabled=false -jobconf hive.start.cleanup.scratchdir=false -jobconf hive.fileformat.check=true -jobconf hive.metastore.connect.retries=5 -jobconf hive.jobname.length=50 -jobconf hive.query.id=hdfs_20120704100909_28d0a9d7-3a7d-484c-9906-7783ebf5f9d1 -jobconf javax.jdo.option.DetachAllOnCommit=true -jobconf javax.jdo.option.ConnectionPassword=hadoop -jobconf hive.index.compact.file.ignore.hdfs=false -jobconf hive.hwi.listen.host=0.0.0.0 -jobconf hive.optimize.reducededuplication=true -jobconf hive.optimize.index.autoupdate=false -jobconf dfs.replication.interval=3 -jobconf hive.auto.progress.timeout=0 -jobconf hive.ppd.recognizetransivity=true -jobconf hive.metastore.force.reload.conf=false -jobconf hive.map.aggr.hash.percentmemory=0.5 -jobconf hive.mapjoin.check.memory.rows=100000 -jobconf javax.jdo.PersistenceManagerFactoryClass=org.datanucleus.jdo.JDOPersistenceManagerFactory -jobconf dfs.replication=3 -jobconf hive.exec.concatenate.check.index=true -jobconf hive.mapred.local.mem=0 -jobconf datanucleus.validateTables=false -jobconf hive.exec.max.dynamic.partitions=1000 -jobconf hive.merge.input.format.block.level=org.apache.hadoop.hive.ql.io.rcfile.merge.RCFileBlockMergeInputFormat -jobconf hive.insert.into.multilevel.dirs=false -jobconf hive.exec.submitviachild=false -jobconf hive.exec.dynamic.partition.mode=strict -jobconf mapred.max.split.size=256000000 -jobconf hive.script.auto.progress=false -jobconf hive.lock.sleep.between.retries=60 -jobconf hive.enforce.sorting=false -jobconf datanucleus.autoCreateSchema=true -jobconf hive.task.progress=false -jobconf hive.index.compact.binary.search=true -jobconf hive.metastore.server.tcp.keepalive=true -jobconf hive.hashtable.initialCapacity=100000 -jobconf hive.lock.manager=org.apache.hadoop.hive.ql.lockmgr.zookeeper.ZooKeeperHiveLockManager -jobconf hive.stats.autogather=true -jobconf dfs.access.time.precision=3600000 -jobconf hive.map.aggr=true -jobconf hadoop.config.dir=%2Fetc%2Fhadoop-0.20%2Fconf -jobconf hive.enforce.bucketing=false -jobconf hive.limit.optimize.fetch.max=50000 -jobconf dfs.safemode.min.datanodes=0 -jobconf hive.autogen.columnalias.prefix.label=_c -jobconf hive.exec.script.allow.partial.consumption=false -jobconf hive.archive.enabled=false -jobconf hive.optimize.bucketmapjoin.sortedmerge=false -jobconf hive.support.concurrency=false -jobconf hive.mapred.reduce.tasks.speculative.execution=true -jobconf hive.heartbeat.interval=1000 -jobconf hive.stats.atomic=false -jobconf hive.exec.counters.pull.interval=1000 -jobconf hive.metastore.event.expiry.duration=0 -jobconf hive.query.result.fileformat=TextFile -jobconf hive.optimize.cp=false -jobconf javax.jdo.option.NonTransactionalRead=true -jobconf dfs.datanode.handler.count=3 -jobconf hive.exec.reducers.bytes.per.reducer=1000000000 -jobconf dfs.replication.considerLoad=true -jobconf dfs.namenode.delegation.token.max-lifetime=604800000 -jobconf hive.error.on.empty.partition=false -jobconf hive.security.authorization.enabled=false
Execution log at:
2012-07-04 10:09:21 Starting to launch local task to process map join; maximum memory = 932118528
2012-07-04 10:09:31 Processing rows: 46671 Hashtable size: 46671 Memory usage: 21544424 rate: 0.023
2012-07-04 10:09:31 Dump the hashtable into file: file:/tmp/hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-local-10003/HashTable-Stage-1/MapJoin-members_map-01--.hashtable
2012-07-04 10:09:33 Upload 1 File to: file:/tmp/hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-local-10003/HashTable-Stage-1/MapJoin-members_map-01--.hashtable File size: 2800256
2012-07-04 10:09:33 End of local task; Time Taken: 11.962 sec.
Execution completed successfully
12/07/04 10:09:33 INFO exec.Task: Execution completed successfully
Mapred Local Task Succeeded . Convert the Join into MapJoin
12/07/04 10:09:33 INFO exec.Task: Mapred Local Task Succeeded . Convert the Join into MapJoin
12/07/04 10:09:33 INFO exec.MapredLocalTask: Execution completed successfully
Mapred Local Task Succeeded . Convert the Join into MapJoin
12/07/04 10:09:33 INFO exec.Task: Mapred Local Task Succeeded . Convert the Join into MapJoin
Launching Job 1 out of 1
12/07/04 10:09:33 INFO ql.Driver: Launching Job 1 out of 1
Number of reduce tasks is set to 0 since there's no reduce operator
12/07/04 10:09:33 INFO exec.Task: Number of reduce tasks is set to 0 since there's no reduce operator
12/07/04 10:09:33 INFO exec.ExecDriver: Using org.apache.hadoop.hive.ql.io.CombineHiveInputFormat
12/07/04 10:09:33 INFO exec.ExecDriver: adding libjars: file:///usr/local/hyves/bin/logging/hadoop/goldmine.jar,file:///opt/hive-0.8.1-bin/lib/hive-builtins-0.8.1.jar,file:///usr/local/hyves/bin/logging/hadoop/json-serde-1.1-jar-with-dependencies.jar
12/07/04 10:09:33 INFO exec.ExecDriver: Archive 1 hash table files to /tmp/hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-local-10003/HashTable-Stage-1/Stage-1.tar.gz
12/07/04 10:09:33 INFO exec.ExecDriver: Upload 1 archive file from/tmp/hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-local-10003/HashTable-Stage-1/Stage-1.tar.gz to: hdfs://hadoop-nn.internal:9000/tmp/hive-hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-mr-10004/HashTable-Stage-1/Stage-1.tar.gz
12/07/04 10:09:33 INFO exec.ExecDriver: Add 1 archive file to distributed cache. Archive file: hdfs://hadoop-nn.internal:9000/tmp/hive-hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-mr-10004/HashTable-Stage-1/Stage-1.tar.gz
12/07/04 10:09:33 INFO exec.ExecDriver: Processing alias visit_stats:visit_stats
12/07/04 10:09:33 INFO exec.ExecDriver: Adding input file hdfs://hadoop-nn.internal:9000/user/hive/warehouse/hyves_goldmine.db/visit_stats/date_int=20120101
12/07/04 10:09:33 INFO exec.Utilities: Content Summary not cached for hdfs://hadoop-nn.internal:9000/user/hive/warehouse/hyves_goldmine.db/visit_stats/date_int=20120101
12/07/04 10:09:34 INFO exec.ExecDriver: Making Temp Directory: hdfs://hadoop-nn.internal:9000/tmp/hive-hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-mr-10002
12/07/04 10:09:34 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
12/07/04 10:09:34 INFO io.CombineHiveInputFormat: CombineHiveInputSplit creating pool for hdfs://hadoop-nn.internal:9000/user/hive/warehouse/hyves_goldmine.db/visit_stats/date_int=20120101; using filter path hdfs://hadoop-nn.internal:9000/user/hive/warehouse/hyves_goldmine.db/visit_stats/date_int=20120101
12/07/04 10:09:34 INFO mapred.FileInputFormat: Total input paths to process : 1
12/07/04 10:09:34 INFO io.CombineHiveInputFormat: number of splits 1
Starting Job = job_201205011246_5661, Tracking URL = http://hadoop-nn.internal:50030/jobdetails.jsp?jobid=job_201205011246_5661
12/07/04 10:09:34 INFO exec.Task: Starting Job = job_201205011246_5661, Tracking URL = http://hadoop-nn.internal:50030/jobdetails.jsp?jobid=job_201205011246_5661
Kill Command = /usr/lib/hadoop-0.20/bin/hadoop job -Dmapred.job.tracker=hadoop-nn.internal:9001 -kill job_201205011246_5661
12/07/04 10:09:34 INFO exec.Task: Kill Command = /usr/lib/hadoop-0.20/bin/hadoop job -Dmapred.job.tracker=hadoop-nn.internal:9001 -kill job_201205011246_5661
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
12/07/04 10:09:37 INFO exec.Task: Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
2012-07-04 10:09:37,830 Stage-1 map = 0%, reduce = 0%
12/07/04 10:09:37 INFO exec.Task: 2012-07-04 10:09:37,830 Stage-1 map = 0%, reduce = 0%
2012-07-04 10:09:41,888 Stage-1 map = 100%, reduce = 0%
12/07/04 10:09:41 INFO exec.Task: 2012-07-04 10:09:41,888 Stage-1 map = 100%, reduce = 0%
2012-07-04 10:09:43,908 Stage-1 map = 100%, reduce = 100%
12/07/04 10:09:43 INFO exec.Task: 2012-07-04 10:09:43,908 Stage-1 map = 100%, reduce = 100%
Ended Job = job_201205011246_5661
12/07/04 10:09:43 INFO exec.Task: Ended Job = job_201205011246_5661
12/07/04 10:09:43 INFO exec.FileSinkOperator: Moving tmp dir: hdfs://hadoop-nn.internal:9000/tmp/hive-hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/_tmp.-mr-10002 to: hdfs://hadoop-nn.internal:9000/tmp/hive-hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/_tmp.-mr-10002.intermediate
12/07/04 10:09:43 INFO exec.FileSinkOperator: Moving tmp dir: hdfs://hadoop-nn.internal:9000/tmp/hive-hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/_tmp.-mr-10002.intermediate to: hdfs://hadoop-nn.internal:9000/tmp/hive-hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-mr-10002
12/07/04 10:09:43 INFO ql.Driver: </PERFLOG method=Driver.execute start=1341389359843 end=1341389383927 duration=24084>
MapReduce Jobs Launched:
12/07/04 10:09:43 INFO ql.Driver: MapReduce Jobs Launched:
Job 0: Map: 1 HDFS Read: 2467969 HDFS Write: 2409082 SUCESS
12/07/04 10:09:43 INFO ql.Driver: Job 0: Map: 1 HDFS Read: 2467969 HDFS Write: 2409082 SUCESS
Total MapReduce CPU Time Spent: 0 msec
12/07/04 10:09:43 INFO ql.Driver: Total MapReduce CPU Time Spent: 0 msec
OK
12/07/04 10:09:43 INFO ql.Driver: OK
12/07/04 10:09:43 INFO ql.Driver: <PERFLOG method=releaseLocks>
12/07/04 10:09:43 INFO ql.Driver: </PERFLOG method=releaseLocks start=1341389383929 end=1341389383929 duration=0>
date_int gender part uniq sec
Time taken: 25.125 seconds
12/07/04 10:09:43 INFO CliDriver: Time taken: 25.125 seconds
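Two details stand out in the log above: only one MapReduce job is launched, and it runs with zero reducers ("Number of reduce tasks is set to 0 since there's no reduce operator"), which looks suspicious for a query that does a GROUP BY with COUNT(DISTINCT ...). One way to dig further (a suggestion, not part of the original gist) would be to compare the plans Hive produces with and without the hint, for example:

    EXPLAIN
    SELECT
        date_int as date_int,
        gender as gender,
        part as part,
        COUNT( DISTINCT memberId ) as uniq,
        SUM( partstats['seconds'] ) as sec
    FROM (
        SELECT
            /*+ MAPJOIN(members_map) */
            date_int,
            visit_stats.memberId,
            members_map.gender as gender,
            parts
        FROM visit_stats
        JOIN members_map ON(members_map.member_id = visit_stats.memberId)
    ) visit_stats
    LATERAL VIEW explode(parts) partsTable AS part, partstats
    WHERE date_int=20120101
    GROUP BY date_int, gender, part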
-- MAPJOIN hint + LATERAL VIEW: returns no rows
SELECT
    date_int,
    gender,
    part,
    COUNT( DISTINCT member_id ),
    SUM( partstats['seconds'] )
FROM (
    SELECT
        /*+ MAPJOIN(members_map) */
        date_int,
        visit_stats.member_id,
        members_map.gender as gender,
        parts
    FROM visit_stats
    JOIN members_map ON(members_map.member_id = visit_stats.member_id)
) visit_stats
LATERAL VIEW explode(parts) partsTable AS part, partstats
GROUP BY date_int, gender, part

-- LATERAL VIEW without the MAPJOIN hint: works
SELECT
    date_int,
    gender,
    part,
    COUNT( DISTINCT member_id ),
    SUM( partstats['seconds'] )
FROM (
    SELECT
        date_int,
        visit_stats.member_id,
        members_map.gender as gender,
        parts
    FROM visit_stats
    JOIN members_map ON(members_map.member_id = visit_stats.member_id)
) visit_stats
LATERAL VIEW explode(parts) partsTable AS part, partstats
GROUP BY date_int, gender, part

-- MAPJOIN hint without LATERAL VIEW: works
SELECT
    date_int,
    gender,
    COUNT( DISTINCT member_id )
FROM (
    SELECT
        /*+ MAPJOIN(members_map) */
        date_int,
        visit_stats.member_id,
        members_map.gender as gender,
        parts
    FROM visit_stats
    JOIN members_map ON(members_map.member_id = visit_stats.member_id)
) visit_stats
GROUP BY date_int, gender
<?php
// Render a string (optionally preceded by a header) inside a padded ASCII banner.
function ascii_header($str, $char = "=", $header = "") {
    $len = 70;
    $str = preg_replace("/^[ ]+/m", "", $str);
    $str = trim($str);
    $bMultiLine = strpos($str, "\n") !== false;
    if (!$bMultiLine && strlen($str) < $len-2) {
        $str = " {$str} ";
    }
    if ($header && strlen($header) < $len-2) {
        $header = " " . trim($header) . " ";
    }

    $ret = "";
    if ($header) {
        $ret .= str_pad("", $len, $char, STR_PAD_BOTH) . "\n";
        $ret .= str_pad($header, $len, $char, STR_PAD_BOTH) . "\n";
    }
    $ret .= str_pad("", $len, $char, STR_PAD_BOTH) . "\n";
    if (!$bMultiLine) {
        $ret .= str_pad($str, $len, $char, STR_PAD_BOTH) . "\n";
    } else {
        $ret .= $str . "\n";
    }
    $ret .= str_pad("", $len, $char, STR_PAD_BOTH) . "\n";

    return $ret;
}

// Write the query to a temp file and run it via `hive -f`.
// When $init is true, the JSON SerDe jar is added and the test database selected first.
function execute($qry, $init = true, $output = true) {
    if ($init) {
        $qry = "ADD JAR /work/hyveshive/tools/goldmine/lib/json-serde-1.1-jar-with-dependencies.jar;
USE hive_mapjoin;
{$qry}";
    }

    echo ascii_header($qry, "=", "execute");

    $tmp = uniqid('tmp_hive_qry_');
    $tmp = sys_get_temp_dir() . "/{$tmp}";
    file_put_contents($tmp, $qry);

    $cmd = "hive -f {$tmp}";
    if ($output) {
        $ret = shell_exec($cmd);
        if (trim($ret)) {
            echo ascii_header($ret, "-", "result");
        }
    } else {
        shell_exec($cmd);
    }

    unlink($tmp);
}
if (!in_array('--nosetup', $argv)) {
    execute("DROP DATABASE IF EXISTS hive_mapjoin CASCADE", false);
    execute("CREATE DATABASE hive_mapjoin", false);

    execute("DROP TABLE IF EXISTS members_map");
    execute("CREATE TABLE members_map (
        member_id INT,
        gender INT,
        birthdate STRING
    )
    ROW FORMAT
    DELIMITED FIELDS TERMINATED BY ','
    STORED AS TEXTFILE;");

    $tmp = uniqid('tmp_hive_data_');
    $tmp = sys_get_temp_dir() . "/{$tmp}";
    $data = array(
        implode(",", array(10101010, 1, '2000-01-01')),
        implode(",", array(10101011, 1, '2000-01-01')),
        implode(",", array(10101012, 2, '2000-01-01')),
        implode(",", array(10101013, 2, '2000-01-01')),
        implode(",", array(10101014, 2, '2000-01-01')),
    );
    file_put_contents($tmp, implode("\n", $data));
    execute("LOAD DATA LOCAL INPATH '{$tmp}' OVERWRITE INTO TABLE members_map");

    execute("DROP TABLE IF EXISTS visit_stats");
    execute("CREATE TABLE visit_stats (
        date_int INT,
        member_id INT,
        parts MAP<STRING, MAP<STRING, INT>>
    )
    ROW FORMAT
    SERDE 'org.openx.data.jsonserde.JsonSerDe'
    STORED AS TEXTFILE;");

    $tmp = uniqid('tmp_hive_data_');
    $tmp = sys_get_temp_dir() . "/{$tmp}";
    $row = array('date_int' => 20120301, 'member_id' => 10101010, 'parts' => array('MAIN' => array('seconds' => 5)));
    $data = array(
        json_encode($row),
        json_encode(array_merge($row, array('member_id' => 10101011))),
        json_encode(array_merge($row, array('member_id' => 10101012))),
        json_encode(array_merge($row, array('member_id' => 10101013))),
        json_encode(array_merge($row, array('member_id' => 10101014))),
        json_encode(array_merge($row, array('member_id' => 10101015))),
    );
    file_put_contents($tmp, implode("\n", $data));
    execute("LOAD DATA LOCAL INPATH '{$tmp}' OVERWRITE INTO TABLE visit_stats");
}
/*
 * this one is working properly
 */
execute("SELECT
    date_int,
    gender,
    part,
    COUNT( DISTINCT member_id ),
    SUM( partstats['seconds'] )
FROM (
    SELECT
        date_int,
        visit_stats.member_id,
        members_map.gender as gender,
        parts
    FROM visit_stats
    JOIN members_map ON(members_map.member_id = visit_stats.member_id)
) visit_stats
LATERAL VIEW explode(parts) partsTable AS part, partstats
GROUP BY date_int, gender, part");

/*
 * adding MAPJOIN, removing LATERAL VIEW
 * still working
 */
execute("SELECT
    date_int,
    gender,
    COUNT( DISTINCT member_id )
FROM (
    SELECT
        /*+ MAPJOIN(members_map) */
        date_int,
        visit_stats.member_id,
        members_map.gender as gender,
        parts
    FROM visit_stats
    JOIN members_map ON(members_map.member_id = visit_stats.member_id)
) visit_stats
GROUP BY date_int, gender");

/*
 * adding MAPJOIN, keeping LATERAL VIEW
 * no output
 */
execute("SELECT
    date_int,
    gender,
    part,
    COUNT( DISTINCT member_id ),
    SUM( partstats['seconds'] )
FROM (
    SELECT
        /*+ MAPJOIN(members_map) */
        date_int,
        visit_stats.member_id,
        members_map.gender as gender,
        parts
    FROM visit_stats
    JOIN members_map ON(members_map.member_id = visit_stats.member_id)
) visit_stats
LATERAL VIEW explode(parts) partsTable AS part, partstats
GROUP BY date_int, gender, part");