Created
April 26, 2012 13:09
-
-
Save rubensayshi/2499436 to your computer and use it in GitHub Desktop.
Hive MAPJOIN + LATERAL VIEW
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Logging initialized using configuration in jar:file:/opt/hive-0.8.1-bin/lib/hive-common-0.8.1.jar!/hive-log4j.properties | |
12/07/04 10:09:16 INFO SessionState: Logging initialized using configuration in jar:file:/opt/hive-0.8.1-bin/lib/hive-common-0.8.1.jar!/hive-log4j.properties | |
Hive history file=/tmp/hdfs/hive_job_log_hdfs_201207041009_796357994.txt | |
12/07/04 10:09:16 INFO exec.HiveHistory: Hive history file=/tmp/hdfs/hive_job_log_hdfs_201207041009_796357994.txt | |
ADD JAR /usr/local/hyves/bin/logging/hadoop/goldmine.jar | |
12/07/04 10:09:16 INFO SessionState: Added /usr/local/hyves/bin/logging/hadoop/goldmine.jar to class path | |
12/07/04 10:09:16 INFO SessionState: Added resource: /usr/local/hyves/bin/logging/hadoop/goldmine.jar | |
ADD JAR /usr/local/hyves/bin/logging/hadoop/json-serde-1.1-jar-with-dependencies.jar | |
12/07/04 10:09:16 INFO SessionState: Added /usr/local/hyves/bin/logging/hadoop/json-serde-1.1-jar-with-dependencies.jar to class path | |
12/07/04 10:09:16 INFO SessionState: Added resource: /usr/local/hyves/bin/logging/hadoop/json-serde-1.1-jar-with-dependencies.jar | |
CREATE TEMPORARY FUNCTION age AS 'org.hyves.hive.udf.Age' | |
12/07/04 10:09:16 INFO ql.Driver: <PERFLOG method=compile> | |
12/07/04 10:09:16 INFO parse.ParseDriver: Parsing command: | |
CREATE TEMPORARY FUNCTION age AS 'org.hyves.hive.udf.Age' | |
12/07/04 10:09:16 INFO parse.ParseDriver: Parse Completed | |
12/07/04 10:09:16 INFO parse.FunctionSemanticAnalyzer: analyze done | |
12/07/04 10:09:16 INFO ql.Driver: Semantic Analysis Completed | |
12/07/04 10:09:16 INFO ql.Driver: Returning Hive schema: Schema(fieldSchemas:null, properties:null) | |
12/07/04 10:09:16 INFO ql.Driver: </PERFLOG method=compile start=1341389356707 end=1341389356907 duration=200> | |
12/07/04 10:09:16 INFO ql.Driver: <PERFLOG method=Driver.execute> | |
12/07/04 10:09:16 INFO ql.Driver: Starting command: | |
CREATE TEMPORARY FUNCTION age AS 'org.hyves.hive.udf.Age' | |
12/07/04 10:09:16 INFO ql.Driver: </PERFLOG method=Driver.execute start=1341389356907 end=1341389356926 duration=19> | |
12/07/04 10:09:16 INFO ql.Driver: OK | |
12/07/04 10:09:16 INFO ql.Driver: <PERFLOG method=releaseLocks> | |
12/07/04 10:09:16 INFO ql.Driver: </PERFLOG method=releaseLocks start=1341389356926 end=1341389356926 duration=0> | |
12/07/04 10:09:16 INFO CliDriver: Time taken: 0.22 seconds | |
CREATE TEMPORARY FUNCTION age_group AS 'org.hyves.hive.udf.AgeGroup' | |
12/07/04 10:09:16 INFO ql.Driver: <PERFLOG method=compile> | |
12/07/04 10:09:16 INFO parse.ParseDriver: Parsing command: | |
CREATE TEMPORARY FUNCTION age_group AS 'org.hyves.hive.udf.AgeGroup' | |
12/07/04 10:09:16 INFO parse.ParseDriver: Parse Completed | |
12/07/04 10:09:16 INFO parse.FunctionSemanticAnalyzer: analyze done | |
12/07/04 10:09:16 INFO ql.Driver: Semantic Analysis Completed | |
12/07/04 10:09:16 INFO ql.Driver: Returning Hive schema: Schema(fieldSchemas:null, properties:null) | |
12/07/04 10:09:16 INFO ql.Driver: </PERFLOG method=compile start=1341389356927 end=1341389356929 duration=2> | |
12/07/04 10:09:16 INFO ql.Driver: <PERFLOG method=Driver.execute> | |
12/07/04 10:09:16 INFO ql.Driver: Starting command: | |
CREATE TEMPORARY FUNCTION age_group AS 'org.hyves.hive.udf.AgeGroup' | |
12/07/04 10:09:16 INFO ql.Driver: </PERFLOG method=Driver.execute start=1341389356929 end=1341389356931 duration=2> | |
12/07/04 10:09:16 INFO ql.Driver: OK | |
12/07/04 10:09:16 INFO ql.Driver: <PERFLOG method=releaseLocks> | |
12/07/04 10:09:16 INFO ql.Driver: </PERFLOG method=releaseLocks start=1341389356932 end=1341389356932 duration=0> | |
12/07/04 10:09:16 INFO CliDriver: Time taken: 0.0050 seconds | |
USE hyves_goldmine | |
12/07/04 10:09:16 INFO ql.Driver: <PERFLOG method=compile> | |
12/07/04 10:09:16 INFO parse.ParseDriver: Parsing command: | |
USE hyves_goldmine | |
12/07/04 10:09:16 INFO parse.ParseDriver: Parse Completed | |
12/07/04 10:09:16 INFO ql.Driver: Semantic Analysis Completed | |
12/07/04 10:09:16 INFO ql.Driver: Returning Hive schema: Schema(fieldSchemas:null, properties:null) | |
12/07/04 10:09:16 INFO ql.Driver: </PERFLOG method=compile start=1341389356932 end=1341389356946 duration=14> | |
12/07/04 10:09:16 INFO ql.Driver: <PERFLOG method=Driver.execute> | |
12/07/04 10:09:16 INFO ql.Driver: Starting command: | |
USE hyves_goldmine | |
12/07/04 10:09:17 INFO metastore.HiveMetaStore: 0: Opening raw store with implementation class:org.apache.hadoop.hive.metastore.ObjectStore | |
12/07/04 10:09:17 INFO metastore.ObjectStore: ObjectStore, initialize called | |
12/07/04 10:09:17 ERROR DataNucleus.Plugin: Bundle "org.eclipse.jdt.core" requires "org.eclipse.core.resources" but it cannot be resolved. | |
12/07/04 10:09:17 ERROR DataNucleus.Plugin: Bundle "org.eclipse.jdt.core" requires "org.eclipse.core.runtime" but it cannot be resolved. | |
12/07/04 10:09:17 ERROR DataNucleus.Plugin: Bundle "org.eclipse.jdt.core" requires "org.eclipse.text" but it cannot be resolved. | |
12/07/04 10:09:18 INFO metastore.ObjectStore: Setting MetaStore object pin classes with hive.metastore.cache.pinobjtypes="Table,StorageDescriptor,SerDeInfo,Partition,Database,Type,FieldSchema,Order" | |
12/07/04 10:09:18 INFO metastore.ObjectStore: Initialized ObjectStore | |
12/07/04 10:09:18 INFO metastore.HiveMetaStore: 0: get_database: hyves_goldmine | |
12/07/04 10:09:18 INFO metastore.HiveMetaStore: 0: get_database: hyves_goldmine | |
12/07/04 10:09:18 INFO ql.Driver: </PERFLOG method=Driver.execute start=1341389356946 end=1341389358808 duration=1862> | |
12/07/04 10:09:18 INFO ql.Driver: OK | |
12/07/04 10:09:18 INFO ql.Driver: <PERFLOG method=releaseLocks> | |
12/07/04 10:09:18 INFO ql.Driver: </PERFLOG method=releaseLocks start=1341389358808 end=1341389358808 duration=0> | |
12/07/04 10:09:18 INFO CliDriver: Time taken: 1.876 seconds | |
set hive.cli.print.header=true | |
set hive.optimize.cp=false | |
SELECT | |
date_int as date_int, | |
gender as gender, | |
part as part, | |
COUNT( DISTINCT memberId ) as uniq, | |
SUM( partstats['seconds'] ) as sec | |
FROM ( | |
SELECT | |
/*+ MAPJOIN(members_map) */ | |
date_int, | |
visit_stats.memberId, | |
members_map.gender as gender, | |
parts | |
FROM visit_stats | |
JOIN members_map ON(members_map.member_id = visit_stats.memberId) | |
) visit_stats | |
LATERAL VIEW explode(parts) partsTable AS part, partstats | |
WHERE date_int=20120101 | |
GROUP BY date_int, gender, part | |
12/07/04 10:09:18 INFO ql.Driver: <PERFLOG method=compile> | |
12/07/04 10:09:18 INFO parse.ParseDriver: Parsing command: | |
SELECT | |
date_int as date_int, | |
gender as gender, | |
part as part, | |
COUNT( DISTINCT memberId ) as uniq, | |
SUM( partstats['seconds'] ) as sec | |
FROM ( | |
SELECT | |
/*+ MAPJOIN(members_map) */ | |
date_int, | |
visit_stats.memberId, | |
members_map.gender as gender, | |
parts | |
FROM visit_stats | |
JOIN members_map ON(members_map.member_id = visit_stats.memberId) | |
) visit_stats | |
LATERAL VIEW explode(parts) partsTable AS part, partstats | |
WHERE date_int=20120101 | |
GROUP BY date_int, gender, part | |
12/07/04 10:09:18 INFO parse.ParseDriver: Parse Completed | |
12/07/04 10:09:18 INFO parse.SemanticAnalyzer: Starting Semantic Analysis | |
12/07/04 10:09:18 INFO parse.SemanticAnalyzer: Completed phase 1 of Semantic Analysis | |
12/07/04 10:09:18 INFO parse.SemanticAnalyzer: Get metadata for source tables | |
12/07/04 10:09:18 INFO parse.SemanticAnalyzer: Get metadata for subqueries | |
12/07/04 10:09:18 INFO parse.SemanticAnalyzer: Get metadata for source tables | |
12/07/04 10:09:18 INFO metastore.HiveMetaStore: 0: get_table : db=hyves_goldmine tbl=visit_stats | |
12/07/04 10:09:19 INFO hive.log: DDL: struct visit_stats { string date, i32 memberid, map<string,i32> generic, map<string,map<string,i32>> platforms, map<string,map<string,i32>> parts, map<string,map<string,map<string,i32>>> devices_by_platform} | |
12/07/04 10:09:19 INFO hive.log: DDL: struct visit_stats { string date, i32 memberid, map<string,i32> generic, map<string,map<string,i32>> platforms, map<string,map<string,i32>> parts, map<string,map<string,map<string,i32>>> devices_by_platform} | |
12/07/04 10:09:19 INFO metastore.HiveMetaStore: 0: get_table : db=hyves_goldmine tbl=members_map | |
12/07/04 10:09:19 INFO parse.ParseDriver: Parsing command: SELECT `member_id` AS `member_id`, `gender` AS `gender`, `birthdate` AS `birthdate` FROM (SELECT `members_map_full`.`member_id`, `members_map_full`.`gender`, `members_map_full`.`birthdate` FROM `members_map_full` WHERE `members_map_full`.`member_id` % 250 = 0) `members_map` | |
12/07/04 10:09:19 INFO parse.ParseDriver: Parse Completed | |
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for subqueries | |
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for source tables | |
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for subqueries | |
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for source tables | |
12/07/04 10:09:19 INFO metastore.HiveMetaStore: 0: get_table : db=hyves_goldmine tbl=members_map_full | |
12/07/04 10:09:19 INFO hive.log: DDL: struct members_map_full { i32 member_id, i32 gender, string birthdate} | |
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for subqueries | |
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for destination tables | |
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for destination tables | |
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for destination tables | |
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Get metadata for destination tables | |
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Completed getting MetaData in Semantic Analysis | |
12/07/04 10:09:19 INFO hive.log: DDL: struct members_map_full { i32 member_id, i32 gender, string birthdate} | |
12/07/04 10:09:19 WARN parse.SemanticAnalyzer: Common Gby keys:null | |
12/07/04 10:09:19 WARN parse.SemanticAnalyzer: Common Gby keys:null | |
12/07/04 10:09:19 WARN parse.SemanticAnalyzer: Common Gby keys:null | |
12/07/04 10:09:19 WARN parse.SemanticAnalyzer: Common Gby keys:null | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for FS(20) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for SEL(19) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for GBY(18) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for RS(17) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for GBY(16) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for SEL(15) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for FIL(14) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of FIL For Alias : visit_stats | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: (_col0 = 20120101) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for LVJ(13) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of LVJ For Alias : visit_stats | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: (_col0 = 20120101) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for SEL(10) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of SEL For Alias : visit_stats | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: (_col0 = 20120101) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for UDTF(12) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for SEL(11) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for LVF(9) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for SEL(8) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of SEL For Alias : visit_stats | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: (_col6 = 20120101) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for JOIN(7) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of JOIN For Alias : visit_stats | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: (VALUE._col6 = 20120101) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for RS(5) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of RS For Alias : visit_stats | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: (date_int = 20120101) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for TS(4) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of TS For Alias : visit_stats | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: (date_int = 20120101) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for RS(6) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for SEL(3) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for SEL(2) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for FIL(1) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of FIL For Alias : members_map_full | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: ((member_id % 250) = 0) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Processing for TS(0) | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: Pushdown Predicates of TS For Alias : members_map_full | |
12/07/04 10:09:19 INFO ppd.OpProcFactory: ((member_id % 250) = 0) | |
12/07/04 10:09:19 INFO ppr.PartitionPruner: <PERFLOG method=prune-listing> | |
12/07/04 10:09:19 INFO metastore.HiveMetaStore: 0: get_partition_names : db=hyves_goldmine tbl=visit_stats | |
12/07/04 10:09:19 INFO ppr.PartitionPruner: </PERFLOG method=prune-listing start=1341389359513 end=1341389359726 duration=213> | |
12/07/04 10:09:19 INFO ppr.PartitionPruner: <PERFLOG method=partition-retrieving> | |
12/07/04 10:09:19 INFO metastore.HiveMetaStore: 0: get_partitions_by_names : db=hyves_goldmine tbl=visit_stats | |
12/07/04 10:09:19 INFO ppr.PartitionPruner: </PERFLOG method=partition-retrieving start=1341389359726 end=1341389359755 duration=29> | |
12/07/04 10:09:19 INFO hive.log: DDL: struct visit_stats { string date, i32 memberid, map<string,i32> generic, map<string,map<string,i32>> platforms, map<string,map<string,i32>> parts, map<string,map<string,map<string,i32>>> devices_by_platform} | |
12/07/04 10:09:19 INFO hive.log: DDL: struct visit_stats { string date, i32 memberid, map<string,i32> generic, map<string,map<string,i32>> platforms, map<string,map<string,i32>> parts, map<string,map<string,map<string,i32>>> devices_by_platform} | |
12/07/04 10:09:19 INFO hive.log: DDL: struct visit_stats { string date, i32 memberid, map<string,i32> generic, map<string,map<string,i32>> platforms, map<string,map<string,i32>> parts, map<string,map<string,map<string,i32>>> devices_by_platform} | |
12/07/04 10:09:19 INFO hive.log: DDL: struct members_map_full { i32 member_id, i32 gender, string birthdate} | |
12/07/04 10:09:19 INFO hive.log: DDL: struct members_map_full { i32 member_id, i32 gender, string birthdate} | |
12/07/04 10:09:19 INFO hive.log: DDL: struct members_map_full { i32 member_id, i32 gender, string birthdate} | |
12/07/04 10:09:19 INFO physical.MetadataOnlyOptimizer: Looking for table scans where optimization is applicable | |
12/07/04 10:09:19 INFO physical.MetadataOnlyOptimizer: Found 0 metadata only table scans | |
12/07/04 10:09:19 INFO physical.MetadataOnlyOptimizer: Looking for table scans where optimization is applicable | |
12/07/04 10:09:19 INFO physical.MetadataOnlyOptimizer: Found 0 metadata only table scans | |
12/07/04 10:09:19 INFO parse.SemanticAnalyzer: Completed plan generation | |
12/07/04 10:09:19 INFO ql.Driver: Semantic Analysis Completed | |
12/07/04 10:09:19 INFO ql.Driver: Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:date_int, type:string, comment:null), FieldSchema(name:gender, type:int, comment:null), FieldSchema(name:part, type:string, comment:null), FieldSchema(name:uniq, type:bigint, comment:null), FieldSchema(name:sec, type:bigint, comment:null)], properties:null) | |
12/07/04 10:09:19 INFO ql.Driver: </PERFLOG method=compile start=1341389358811 end=1341389359843 duration=1032> | |
12/07/04 10:09:19 INFO ql.Driver: <PERFLOG method=Driver.execute> | |
12/07/04 10:09:19 INFO ql.Driver: Starting command: | |
SELECT | |
date_int as date_int, | |
gender as gender, | |
part as part, | |
COUNT( DISTINCT memberId ) as uniq, | |
SUM( partstats['seconds'] ) as sec | |
FROM ( | |
SELECT | |
/*+ MAPJOIN(members_map) */ | |
date_int, | |
visit_stats.memberId, | |
members_map.gender as gender, | |
parts | |
FROM visit_stats | |
JOIN members_map ON(members_map.member_id = visit_stats.memberId) | |
) visit_stats | |
LATERAL VIEW explode(parts) partsTable AS part, partstats | |
WHERE date_int=20120101 | |
GROUP BY date_int, gender, part | |
Total MapReduce jobs = 1 | |
12/07/04 10:09:19 INFO ql.Driver: Total MapReduce jobs = 1 | |
12/07/04 10:09:19 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable | |
12/07/04 10:09:19 INFO exec.MapredLocalTask: Generating plan file file:/tmp/hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-local-10005/plan.xml | |
12/07/04 10:09:20 INFO exec.MapredLocalTask: Executing: /usr/lib/hadoop-0.20/bin/hadoop jar /opt/hive-0.8.1-bin/lib/hive-exec-0.8.1.jar org.apache.hadoop.hive.ql.exec.ExecDriver -localtask -plan file:/tmp/hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-local-10005/plan.xml -jobconf hive.stats.jdbcdriver=org.apache.derby.jdbc.EmbeddedDriver -jobconf hive.mapjoin.followby.gby.localtask.max.memory.usage=0.55 -jobconf hive.limit.optimize.limit.file=10 -jobconf hive.metastore.archive.intermediate.archived=_INTERMEDIATE_ARCHIVED -jobconf datanucleus.validateColumns=false -jobconf hive.mapred.partitioner=org.apache.hadoop.hive.ql.io.DefaultHivePartitioner -jobconf hive.cli.errors.ignore=false -jobconf hive.downloaded.resources.dir=%2Ftmp%2Fhdfs%2Fhive_resources -jobconf hive.lockmgr.zookeeper.default.partition.name=__HIVE_DEFAULT_ZOOKEEPER_PARTITION__ -jobconf hive.zookeeper.session.timeout=600000 -jobconf datanucleus.connectionPoolingType=DBCP -jobconf hive.variable.substitute=true -jobconf dfs.namenode.decommission.nodes.per.interval=5 -jobconf dfs.https.need.client.auth=false -jobconf hive.stats.dbclass=jdbc%3Aderby -jobconf hive.ppd.remove.duplicatefilters=true -jobconf hive.lock.mapred.only.operation=false -jobconf dfs.datanode.data.dir.perm=700 -jobconf hive.script.serde=org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe -jobconf dfs.datanode.address=0.0.0.0%3A50010 -jobconf dfs.namenode.logging.level=info -jobconf dfs.block.access.token.enable=false -jobconf hive.merge.smallfiles.avgsize=16000000 -jobconf datanucleus.cache.level2=false -jobconf hive.hbase.wal.enabled=true -jobconf dfs.safemode.threshold.pct=0.999f -jobconf dfs.blockreport.initialDelay=0 -jobconf dfs.namenode.handler.count=10 -jobconf hive.security.authorization.manager=org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider -jobconf hive.merge.current.job.has.dynamic.partitions=false -jobconf hive.metastore.rawstore.impl=org.apache.hadoop.hive.metastore.ObjectStore 
-jobconf hive.metastore.local=true -jobconf hive.optimize.bucketmapjoin=false -jobconf hive.optimize.ppd.storage=true -jobconf hive.querylog.location=%2Ftmp%2Fhdfs -jobconf hive.limit.row.max.size=100000 -jobconf dfs.block.access.key.update.interval=600 -jobconf hive.rework.mapredwork=false -jobconf dfs.safemode.extension=30000 -jobconf hive.index.compact.query.max.entries=10000000 -jobconf hive.metastore.authorization.storage.checks=false -jobconf hive.autogen.columnalias.prefix.includefuncname=false -jobconf hive.zookeeper.namespace=hive_zookeeper_namespace -jobconf hive.test.mode.prefix=test_ -jobconf hive.merge.rcfile.block.level=true -jobconf hive.test.mode=false -jobconf hive.exec.compress.intermediate=false -jobconf datanucleus.cache.level2.type=SOFT -jobconf dfs.https.server.keystore.resource=ssl-server.xml -jobconf hive.metastore.ds.retry.attempts=1 -jobconf hive.limit.optimize.enable=false -jobconf hive.zookeeper.client.port=2181 -jobconf dfs.block.access.token.lifetime=600 -jobconf hive.exec.perf.logger=org.apache.hadoop.hive.ql.log.PerfLogger -jobconf javax.jdo.option.ConnectionUserName=hadoop -jobconf dfs.name.edits.dir=%24%7Bdfs.name.dir%7D -jobconf hive.merge.mapfiles=true -jobconf hive.test.mode.samplefreq=32 -jobconf hive.optimize.skewjoin=false -jobconf hive.optimize.index.groupby=false -jobconf hive.metastore.server.min.threads=200 -jobconf hive.mapjoin.localtask.max.memory.usage=0.9 -jobconf dfs.block.size=67108864 -jobconf hive.map.aggr.hash.min.reduction=0.5 -jobconf hive.exec.compress.output=false -jobconf dfs.datanode.ipc.address=0.0.0.0%3A50020 -jobconf javax.jdo.option.Multithreaded=true -jobconf hive.script.recordreader=org.apache.hadoop.hive.ql.exec.TextRecordReader -jobconf dfs.permissions=true -jobconf hive.multigroupby.singlemr=false -jobconf hive.lock.numretries=100 -jobconf hive.optimize.metadataonly=true -jobconf hive.exec.parallel.thread.number=8 -jobconf hive.exec.default.partition.name=__HIVE_DEFAULT_PARTITION__ -jobconf 
hive.exec.max.created.files=100000 -jobconf hive.archive.har.parentdir.settable=false -jobconf hive.metastore.event.clean.freq=0 -jobconf dfs.datanode.https.address=0.0.0.0%3A50475 -jobconf hive.exec.mode.local.auto=false -jobconf dfs.secondary.http.address=0.0.0.0%3A50090 -jobconf hive.optimize.index.filter=false -jobconf datanucleus.storeManagerType=rdbms -jobconf dfs.replication.max=512 -jobconf hive.script.operator.id.env.var=HIVE_SCRIPT_OPERATOR_ID -jobconf hive.exec.mode.local.auto.inputbytes.max=134217728 -jobconf mapred.min.split.size=1 -jobconf dfs.namenode.delegation.token.renew-interval=86400000 -jobconf hive.mapjoin.size.key=10000 -jobconf hive.metastore.ds.retry.interval=1000 -jobconf hive.skewjoin.mapjoin.min.split=33554432 -jobconf hive.metastore.client.connect.retry.delay=1 -jobconf hive.auto.convert.join=false -jobconf dfs.https.client.keystore.resource=ssl-client.xml -jobconf hive.metastore.warehouse.dir=%2Fuser%2Fhive%2Fwarehouse -jobconf hive.mapjoin.bucket.cache.size=100 -jobconf hive.exec.job.debug.timeout=30000 -jobconf datanucleus.transactionIsolation=read-committed -jobconf hive.stats.jdbc.timeout=30 -jobconf hive.mergejob.maponly=true -jobconf dfs.https.address=0.0.0.0%3A50470 -jobconf dfs.balance.bandwidthPerSec=1048576 -jobconf hive.fetch.output.serde=org.apache.hadoop.hive.serde2.DelimitedJSONSerDe -jobconf hive.root.logger=INFO%2Cconsole -jobconf hive.exec.script.trust=false -jobconf hive.mapjoin.followby.map.aggr.hash.percentmemory=0.3 -jobconf hive.exim.uri.scheme.whitelist=hdfs%2Cpfile -jobconf hive.stats.dbconnectionstring=jdbc%3Aderby%3A%3BdatabaseName%3DTempStatsStore%3Bcreate%3Dtrue -jobconf mapred.reduce.tasks=-1 -jobconf hive.optimize.index.filter.compact.minsize=5368709120 -jobconf hive.skewjoin.key=100000 -jobconf javax.jdo.option.ConnectionDriverName=com.mysql.jdbc.Driver -jobconf hive.security.authenticator.manager=org.apache.hadoop.hive.ql.security.HadoopDefaultAuthenticator -jobconf dfs.max.objects=0 -jobconf 
mapred.input.dir.recursive=false -jobconf hive.udtf.auto.progress=false -jobconf hive.session.id=hdfs_201207041009 -jobconf mapred.job.name= -jobconf dfs.datanode.dns.nameserver=default -jobconf hive.exec.script.maxerrsize=100000 -jobconf dfs.blockreport.intervalMsec=3600000 -jobconf hive.optimize.groupby=true -jobconf datanucleus.plugin.pluginRegistryBundleCheck=LOG -jobconf hive.exec.rowoffset=false -jobconf hive.default.fileformat=TextFile -jobconf hive.hadoop.supports.splittable.combineinputformat=false -jobconf hive.metastore.archive.intermediate.original=_INTERMEDIATE_ORIGINAL -jobconf hive.mapjoin.smalltable.filesize=25000000 -jobconf hive.exec.scratchdir=%2Ftmp%2Fhive-hdfs -jobconf datanucleus.identifierFactory=datanucleus -jobconf hive.exec.max.dynamic.partitions.pernode=100 -jobconf hive.stats.retries.max=0 -jobconf dfs.client.block.write.retries=3 -jobconf hive.join.emit.interval=1000 -jobconf hive.script.recordwriter=org.apache.hadoop.hive.ql.exec.TextRecordWriter -jobconf datanucleus.validateConstraints=false -jobconf hive.exec.dynamic.partition=false -jobconf hive.hashtable.loadfactor=0.75 -jobconf dfs.https.enable=false -jobconf hive.sample.seednumber=0 -jobconf dfs.datanode.failed.volumes.tolerated=0 -jobconf hive.optimize.index.filter.compact.maxsize=-1 -jobconf hive.metastore.client.socket.timeout=20 -jobconf hive.map.aggr.hash.force.flush.memory.threshold=0.9 -jobconf dfs.namenode.delegation.key.update-interval=86400000 -jobconf hive.exec.show.job.failure.debug.info=true -jobconf hive.join.cache.size=25000 -jobconf hive.mapper.cannot.span.multiple.partitions=false -jobconf dfs.default.chunk.view.size=32768 -jobconf datanucleus.autoStartMechanismMode=checked -jobconf mapred.min.split.size.per.node=1 -jobconf hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat -jobconf hive.skewjoin.mapjoin.map.tasks=10000 -jobconf hive.index.compact.query.max.size=10737418240 -jobconf hive.exec.drop.ignorenonexistent=true -jobconf 
hive.hwi.listen.port=9999 -jobconf hive.session.silent=false -jobconf hive.cli.print.header=true -jobconf dfs.datanode.du.reserved=0 -jobconf hadoop.bin.path=%2Fusr%2Flib%2Fhadoop-0.20%2Fbin%2Fhadoop -jobconf dfs.web.ugi=webuser%2Cwebgroup -jobconf dfs.df.interval=60000 -jobconf hive.groupby.mapaggr.checkinterval=100000 -jobconf mapred.min.split.size.per.rack=1 -jobconf dfs.data.dir=%24%7Bdfs.name.dir%7D%2Fdata -jobconf hive.metastore.server.max.threads=100000 -jobconf dfs.datanode.dns.interface=default -jobconf hive.metastore.kerberos.principal=hive-metastore%2F_HOST%40EXAMPLE.COM -jobconf javax.jdo.option.ConnectionURL=jdbc%3Amysql%3A%2F%2Flocalhost%3A3306%2Fhive%3FcreateDatabaseIfNotExist%3Dtrue -jobconf hive.mapred.mode=nonstrict -jobconf hive.metastore.batch.retrieve.max=300 -jobconf hive.outerjoin.supports.filters=true -jobconf hive.groupby.skewindata=false -jobconf dfs.support.append=true -jobconf hive.exec.mode.local.auto.tasks.max=4 -jobconf hive.exec.parallel=false -jobconf hive.debug.localtask=false -jobconf dfs.permissions.supergroup=supergroup -jobconf hive.zookeeper.clean.extra.nodes=false -jobconf dfs.replication.min=1 -jobconf hive.merge.size.per.task=256000000 -jobconf fs.har.impl=org.apache.hadoop.hive.shims.HiveHarFileSystem -jobconf hive.unlock.numretries=10 -jobconf hive.exec.reducers.max=999 -jobconf hive.cli.print.current.db=false -jobconf hive.exec.tasklog.debug.timeout=20000 -jobconf dfs.namenode.decommission.interval=30 -jobconf hive.metastore.archive.intermediate.extracted=_INTERMEDIATE_EXTRACTED -jobconf hive.metastore.execute.setugi=false -jobconf dfs.http.address=hadoop-nn.internal%3A50070 -jobconf 
hive.query.string=%0ASELECT%0A++++date_int+as+date_int%2C%0A++++gender+as+gender%2C%0A++++part+as+part%2C%0A++++COUNT%28+DISTINCT+memberId+%29+as+uniq%2C%0A++++SUM%28+partstats%5B%27seconds%27%5D+%29+as+sec%0AFROM+%28%0A++++SELECT%0A++++++++%2F*%2B+MAPJOIN%28members_map%29+*%2F%0A++++++++date_int%2C%0A++++++++visit_stats.memberId%2C%0A++++++++members_map.gender+as+gender%2C%0A++++++++parts%0A++++FROM+visit_stats%0A++++JOIN+members_map+ON%28members_map.member_id+%3D+visit_stats.memberId%29%0A%29+visit_stats%0ALATERAL+VIEW+explode%28parts%29+partsTable+AS+part%2C+partstats%0AWHERE+date_int%3D20120101%0AGROUP+BY+date_int%2C+gender%2C+part -jobconf dfs.heartbeat.interval=3 -jobconf hive.stats.collect.rawdatasize=true -jobconf hive.optimize.ppd=true -jobconf dfs.name.dir=%2Fvar%2Flib%2Fhadoop -jobconf hive.stats.retries.wait=3000 -jobconf hive.metastore.cache.pinobjtypes=Table%2CStorageDescriptor%2CSerDeInfo%2CPartition%2CDatabase%2CType%2CFieldSchema%2COrder -jobconf hive.metastore.fs.handler.class=org.apache.hadoop.hive.metastore.HiveMetaStoreFsImpl -jobconf dfs.datanode.http.address=0.0.0.0%3A50075 -jobconf hive.mapjoin.cache.numrows=25000 -jobconf hive.merge.mapredfiles=false -jobconf hive.metastore.sasl.enabled=false -jobconf hive.start.cleanup.scratchdir=false -jobconf hive.fileformat.check=true -jobconf hive.metastore.connect.retries=5 -jobconf hive.jobname.length=50 -jobconf hive.query.id=hdfs_20120704100909_28d0a9d7-3a7d-484c-9906-7783ebf5f9d1 -jobconf javax.jdo.option.DetachAllOnCommit=true -jobconf javax.jdo.option.ConnectionPassword=hadoop -jobconf hive.index.compact.file.ignore.hdfs=false -jobconf hive.hwi.listen.host=0.0.0.0 -jobconf hive.optimize.reducededuplication=true -jobconf hive.optimize.index.autoupdate=false -jobconf dfs.replication.interval=3 -jobconf hive.auto.progress.timeout=0 -jobconf hive.ppd.recognizetransivity=true -jobconf hive.metastore.force.reload.conf=false -jobconf hive.map.aggr.hash.percentmemory=0.5 -jobconf 
hive.mapjoin.check.memory.rows=100000 -jobconf javax.jdo.PersistenceManagerFactoryClass=org.datanucleus.jdo.JDOPersistenceManagerFactory -jobconf dfs.replication=3 -jobconf hive.exec.concatenate.check.index=true -jobconf hive.mapred.local.mem=0 -jobconf datanucleus.validateTables=false -jobconf hive.exec.max.dynamic.partitions=1000 -jobconf hive.merge.input.format.block.level=org.apache.hadoop.hive.ql.io.rcfile.merge.RCFileBlockMergeInputFormat -jobconf hive.insert.into.multilevel.dirs=false -jobconf hive.exec.submitviachild=false -jobconf hive.exec.dynamic.partition.mode=strict -jobconf mapred.max.split.size=256000000 -jobconf hive.script.auto.progress=false -jobconf hive.lock.sleep.between.retries=60 -jobconf hive.enforce.sorting=false -jobconf datanucleus.autoCreateSchema=true -jobconf hive.task.progress=false -jobconf hive.index.compact.binary.search=true -jobconf hive.metastore.server.tcp.keepalive=true -jobconf hive.hashtable.initialCapacity=100000 -jobconf hive.lock.manager=org.apache.hadoop.hive.ql.lockmgr.zookeeper.ZooKeeperHiveLockManager -jobconf hive.stats.autogather=true -jobconf dfs.access.time.precision=3600000 -jobconf hive.map.aggr=true -jobconf hadoop.config.dir=%2Fetc%2Fhadoop-0.20%2Fconf -jobconf hive.enforce.bucketing=false -jobconf hive.limit.optimize.fetch.max=50000 -jobconf dfs.safemode.min.datanodes=0 -jobconf hive.autogen.columnalias.prefix.label=_c -jobconf hive.exec.script.allow.partial.consumption=false -jobconf hive.archive.enabled=false -jobconf hive.optimize.bucketmapjoin.sortedmerge=false -jobconf hive.support.concurrency=false -jobconf hive.mapred.reduce.tasks.speculative.execution=true -jobconf hive.heartbeat.interval=1000 -jobconf hive.stats.atomic=false -jobconf hive.exec.counters.pull.interval=1000 -jobconf hive.metastore.event.expiry.duration=0 -jobconf hive.query.result.fileformat=TextFile -jobconf hive.optimize.cp=false -jobconf javax.jdo.option.NonTransactionalRead=true -jobconf dfs.datanode.handler.count=3 -jobconf 
hive.exec.reducers.bytes.per.reducer=1000000000 -jobconf dfs.replication.considerLoad=true -jobconf dfs.namenode.delegation.token.max-lifetime=604800000 -jobconf hive.error.on.empty.partition=false -jobconf hive.security.authorization.enabled=false | |
Execution log at: | |
2012-07-04 10:09:21 Starting to launch local task to process map join; maximum memory = 932118528 | |
2012-07-04 10:09:31 Processing rows: 46671 Hashtable size: 46671 Memory usage: 21544424 rate: 0.023 | |
2012-07-04 10:09:31 Dump the hashtable into file: file:/tmp/hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-local-10003/HashTable-Stage-1/MapJoin-members_map-01--.hashtable | |
2012-07-04 10:09:33 Upload 1 File to: file:/tmp/hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-local-10003/HashTable-Stage-1/MapJoin-members_map-01--.hashtable File size: 2800256 | |
2012-07-04 10:09:33 End of local task; Time Taken: 11.962 sec. | |
Execution completed successfully | |
12/07/04 10:09:33 INFO exec.Task: Execution completed successfully | |
Mapred Local Task Succeeded . Convert the Join into MapJoin | |
12/07/04 10:09:33 INFO exec.Task: Mapred Local Task Succeeded . Convert the Join into MapJoin | |
12/07/04 10:09:33 INFO exec.MapredLocalTask: Execution completed successfully | |
Mapred Local Task Succeeded . Convert the Join into MapJoin | |
12/07/04 10:09:33 INFO exec.Task: Mapred Local Task Succeeded . Convert the Join into MapJoin | |
Launching Job 1 out of 1 | |
12/07/04 10:09:33 INFO ql.Driver: Launching Job 1 out of 1 | |
Number of reduce tasks is set to 0 since there's no reduce operator | |
12/07/04 10:09:33 INFO exec.Task: Number of reduce tasks is set to 0 since there's no reduce operator | |
12/07/04 10:09:33 INFO exec.ExecDriver: Using org.apache.hadoop.hive.ql.io.CombineHiveInputFormat | |
12/07/04 10:09:33 INFO exec.ExecDriver: adding libjars: file:///usr/local/hyves/bin/logging/hadoop/goldmine.jar,file:///opt/hive-0.8.1-bin/lib/hive-builtins-0.8.1.jar,file:///usr/local/hyves/bin/logging/hadoop/json-serde-1.1-jar-with-dependencies.jar | |
12/07/04 10:09:33 INFO exec.ExecDriver: Archive 1 hash table files to /tmp/hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-local-10003/HashTable-Stage-1/Stage-1.tar.gz | |
12/07/04 10:09:33 INFO exec.ExecDriver: Upload 1 archive file from/tmp/hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-local-10003/HashTable-Stage-1/Stage-1.tar.gz to: hdfs://hadoop-nn.internal:9000/tmp/hive-hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-mr-10004/HashTable-Stage-1/Stage-1.tar.gz | |
12/07/04 10:09:33 INFO exec.ExecDriver: Add 1 archive file to distributed cache. Archive file: hdfs://hadoop-nn.internal:9000/tmp/hive-hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-mr-10004/HashTable-Stage-1/Stage-1.tar.gz | |
12/07/04 10:09:33 INFO exec.ExecDriver: Processing alias visit_stats:visit_stats | |
12/07/04 10:09:33 INFO exec.ExecDriver: Adding input file hdfs://hadoop-nn.internal:9000/user/hive/warehouse/hyves_goldmine.db/visit_stats/date_int=20120101 | |
12/07/04 10:09:33 INFO exec.Utilities: Content Summary not cached for hdfs://hadoop-nn.internal:9000/user/hive/warehouse/hyves_goldmine.db/visit_stats/date_int=20120101 | |
12/07/04 10:09:34 INFO exec.ExecDriver: Making Temp Directory: hdfs://hadoop-nn.internal:9000/tmp/hive-hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-mr-10002 | |
12/07/04 10:09:34 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same. | |
12/07/04 10:09:34 INFO io.CombineHiveInputFormat: CombineHiveInputSplit creating pool for hdfs://hadoop-nn.internal:9000/user/hive/warehouse/hyves_goldmine.db/visit_stats/date_int=20120101; using filter path hdfs://hadoop-nn.internal:9000/user/hive/warehouse/hyves_goldmine.db/visit_stats/date_int=20120101 | |
12/07/04 10:09:34 INFO mapred.FileInputFormat: Total input paths to process : 1 | |
12/07/04 10:09:34 INFO io.CombineHiveInputFormat: number of splits 1 | |
Starting Job = job_201205011246_5661, Tracking URL = http://hadoop-nn.internal:50030/jobdetails.jsp?jobid=job_201205011246_5661 | |
12/07/04 10:09:34 INFO exec.Task: Starting Job = job_201205011246_5661, Tracking URL = http://hadoop-nn.internal:50030/jobdetails.jsp?jobid=job_201205011246_5661 | |
Kill Command = /usr/lib/hadoop-0.20/bin/hadoop job -Dmapred.job.tracker=hadoop-nn.internal:9001 -kill job_201205011246_5661 | |
12/07/04 10:09:34 INFO exec.Task: Kill Command = /usr/lib/hadoop-0.20/bin/hadoop job -Dmapred.job.tracker=hadoop-nn.internal:9001 -kill job_201205011246_5661 | |
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0 | |
12/07/04 10:09:37 INFO exec.Task: Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0 | |
2012-07-04 10:09:37,830 Stage-1 map = 0%, reduce = 0% | |
12/07/04 10:09:37 INFO exec.Task: 2012-07-04 10:09:37,830 Stage-1 map = 0%, reduce = 0% | |
2012-07-04 10:09:41,888 Stage-1 map = 100%, reduce = 0% | |
12/07/04 10:09:41 INFO exec.Task: 2012-07-04 10:09:41,888 Stage-1 map = 100%, reduce = 0% | |
2012-07-04 10:09:43,908 Stage-1 map = 100%, reduce = 100% | |
12/07/04 10:09:43 INFO exec.Task: 2012-07-04 10:09:43,908 Stage-1 map = 100%, reduce = 100% | |
Ended Job = job_201205011246_5661 | |
12/07/04 10:09:43 INFO exec.Task: Ended Job = job_201205011246_5661 | |
12/07/04 10:09:43 INFO exec.FileSinkOperator: Moving tmp dir: hdfs://hadoop-nn.internal:9000/tmp/hive-hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/_tmp.-mr-10002 to: hdfs://hadoop-nn.internal:9000/tmp/hive-hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/_tmp.-mr-10002.intermediate | |
12/07/04 10:09:43 INFO exec.FileSinkOperator: Moving tmp dir: hdfs://hadoop-nn.internal:9000/tmp/hive-hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/_tmp.-mr-10002.intermediate to: hdfs://hadoop-nn.internal:9000/tmp/hive-hdfs/hive_2012-07-04_10-09-18_812_8295196618988484813/-mr-10002 | |
12/07/04 10:09:43 INFO ql.Driver: </PERFLOG method=Driver.execute start=1341389359843 end=1341389383927 duration=24084> | |
MapReduce Jobs Launched: | |
12/07/04 10:09:43 INFO ql.Driver: MapReduce Jobs Launched: | |
Job 0: Map: 1 HDFS Read: 2467969 HDFS Write: 2409082 SUCESS | |
12/07/04 10:09:43 INFO ql.Driver: Job 0: Map: 1 HDFS Read: 2467969 HDFS Write: 2409082 SUCESS | |
Total MapReduce CPU Time Spent: 0 msec | |
12/07/04 10:09:43 INFO ql.Driver: Total MapReduce CPU Time Spent: 0 msec | |
OK | |
12/07/04 10:09:43 INFO ql.Driver: OK | |
12/07/04 10:09:43 INFO ql.Driver: <PERFLOG method=releaseLocks> | |
12/07/04 10:09:43 INFO ql.Driver: </PERFLOG method=releaseLocks start=1341389383929 end=1341389383929 duration=0> | |
date_int gender part uniq sec | |
Time taken: 25.125 seconds | |
12/07/04 10:09:43 INFO CliDriver: Time taken: 25.125 seconds |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Per-day, per-gender, per-site-part unique visitors and total seconds.
-- The inner query joins visit stats against the small members_map table
-- (MAPJOIN hint keeps it in memory on the mappers); the LATERAL VIEW then
-- explodes the parts MAP into one row per part.
-- NOTE(review): per this gist, combining the MAPJOIN hint with LATERAL VIEW
-- produces NO output on Hive 0.8.1 -- drop the hint (or the lateral view)
-- as a workaround; confirm against the Hive bug tracker before reusing.
SELECT
    date_int,
    gender,
    part,
    COUNT(DISTINCT member_id),
    SUM(partstats['seconds'])
FROM (
    SELECT
        /*+ MAPJOIN(members_map) */
        date_int,
        visit_stats.member_id,
        members_map.gender AS gender,
        parts
    FROM visit_stats
    JOIN members_map ON (members_map.member_id = visit_stats.member_id)
) visit_stats
LATERAL VIEW explode(parts) partsTable AS part, partstats
GROUP BY date_int, gender, part
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Same aggregation as the MAPJOIN variant, but with a plain (shuffle) JOIN.
-- Per this gist, this combination of JOIN + LATERAL VIEW works correctly
-- on Hive 0.8.1; only adding the MAPJOIN hint breaks it.
SELECT
    date_int,
    gender,
    part,
    COUNT(DISTINCT member_id),
    SUM(partstats['seconds'])
FROM (
    SELECT
        date_int,
        visit_stats.member_id,
        members_map.gender AS gender,
        parts
    FROM visit_stats
    JOIN members_map ON (members_map.member_id = visit_stats.member_id)
) visit_stats
LATERAL VIEW explode(parts) partsTable AS part, partstats
GROUP BY date_int, gender, part
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Control case: MAPJOIN hint WITHOUT the lateral view.
-- Per-day, per-gender unique visitor counts; the parts column is selected
-- in the subquery but never exploded. Per this gist this query works on
-- Hive 0.8.1, isolating the failure to MAPJOIN + LATERAL VIEW combined.
SELECT
    date_int,
    gender,
    COUNT(DISTINCT member_id)
FROM (
    SELECT
        /*+ MAPJOIN(members_map) */
        date_int,
        visit_stats.member_id,
        members_map.gender AS gender,
        parts
    FROM visit_stats
    JOIN members_map ON (members_map.member_id = visit_stats.member_id)
) visit_stats
GROUP BY date_int, gender
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Render $str inside a 70-column banner of $char characters, optionally
 * topped by a centred $header line. Multi-line strings are emitted as-is
 * between the rules; single-line strings are centred within a rule.
 *
 * @param string $str    Text to frame (leading indentation is stripped).
 * @param string $char   Character used to draw the banner rules.
 * @param string $header Optional title centred above the banner.
 * @return string        The rendered banner, newline-terminated.
 */
function ascii_header($str, $char = "=", $header = "") {
    $width = 70;
    $rule = str_pad("", $width, $char, STR_PAD_BOTH) . "\n";

    // Strip per-line leading indentation, then surrounding whitespace.
    $str = preg_replace("/^[ ]+/m", "", $str);
    $str = trim($str);
    $isMultiLine = strpos($str, "\n") !== false;
    if (!$isMultiLine && strlen($str) < $width - 2) {
        $str = " {$str} "; // breathing room around short one-liners
    }
    if ($header && strlen($header) < $width - 2) {
        $header = " " . trim($header) . " ";
    }

    $out = "";
    if ($header) {
        $out .= $rule;
        $out .= str_pad($header, $width, $char, STR_PAD_BOTH) . "\n";
    }
    $out .= $rule;
    // Multi-line content is printed verbatim; one-liners are centred.
    $out .= $isMultiLine
        ? $str . "\n"
        : str_pad($str, $width, $char, STR_PAD_BOTH) . "\n";
    $out .= $rule;
    return $out;
}
/**
 * Run a Hive query (or semicolon-separated script) through the `hive` CLI.
 *
 * The query is written to a temporary file and executed with `hive -f`;
 * the file is removed afterwards.
 *
 * @param string $qry    HiveQL to execute.
 * @param bool   $init   When true, prepend the session setup (ADD JAR + USE).
 * @param bool   $output When true, echo the CLI output (if any) in a banner.
 * @return string|null   Raw CLI output, or null if `hive` produced none
 *                       (backward-compatible: previous version returned null).
 */
function execute($qry, $init = true, $output = true) {
    if ($init) {
        $qry = "ADD JAR /work/hyveshive/tools/goldmine/lib/json-serde-1.1-jar-with-dependencies.jar;
USE hive_mapjoin;
{$qry}";
    }
    echo ascii_header($qry, "=", "execute");
    // tempnam() creates the file atomically, unlike the old
    // uniqid()-built path, so concurrent runs cannot collide.
    $tmp = tempnam(sys_get_temp_dir(), 'tmp_hive_qry_');
    file_put_contents($tmp, $qry);
    // Quote the path defensively; sys_get_temp_dir() may contain spaces.
    $cmd = "hive -f " . escapeshellarg($tmp);
    // Always capture output (the old code ran the command either way);
    // only echo it when requested and non-empty.
    $ret = shell_exec($cmd);
    if ($output && $ret !== null && trim($ret)) {
        echo ascii_header($ret, "-", "result");
    }
    unlink($tmp);
    return $ret;
}
if (!in_array('--nosetup', $argv)) {
    // Rebuild the scratch database from zero so every run starts clean.
    execute("DROP DATABASE IF EXISTS hive_mapjoin CASCADE", false);
    execute("CREATE DATABASE hive_mapjoin", false);

    // Small dimension table (the MAPJOIN candidate): one CSV row per member.
    execute("DROP TABLE IF EXISTS members_map");
    execute("CREATE TABLE members_map (
member_id INT,
gender INT,
birthdate STRING
)
ROW FORMAT
DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;");
    $memberRows = array(
        array(10101010, 1, '2000-01-01'),
        array(10101011, 1, '2000-01-01'),
        array(10101012, 2, '2000-01-01'),
        array(10101013, 2, '2000-01-01'),
        array(10101014, 2, '2000-01-01'),
    );
    $csvLines = array();
    foreach ($memberRows as $memberRow) {
        $csvLines[] = implode(",", $memberRow);
    }
    $memberFile = sys_get_temp_dir() . "/" . uniqid('tmp_hive_data_');
    file_put_contents($memberFile, implode("\n", $csvLines));
    execute("LOAD DATA LOCAL INPATH '{$memberFile}' OVERWRITE INTO TABLE members_map");

    // Fact table: one JSON row per visit, with a nested parts map
    // (part name -> stats map) read through the JSON SerDe.
    execute("DROP TABLE IF EXISTS visit_stats");
    execute("CREATE TABLE visit_stats (
date_int INT,
member_id INT,
parts MAP<STRING, MAP<STRING, INT>>
)
ROW FORMAT
SERDE 'org.openx.data.jsonserde.JsonSerDe'
STORED AS TEXTFILE;");
    $baseRow = array('date_int' => 20120301, 'member_id' => 10101010, 'parts' => array('MAIN' => array('seconds' => 5)));
    // First row uses the base member id; the rest only vary member_id.
    $jsonLines = array(json_encode($baseRow));
    foreach (array(10101011, 10101012, 10101013, 10101014, 10101015) as $memberId) {
        $jsonLines[] = json_encode(array_merge($baseRow, array('member_id' => $memberId)));
    }
    $visitFile = sys_get_temp_dir() . "/" . uniqid('tmp_hive_data_');
    file_put_contents($visitFile, implode("\n", $jsonLines));
    execute("LOAD DATA LOCAL INPATH '{$visitFile}' OVERWRITE INTO TABLE visit_stats");
}
// Case 1: plain JOIN + LATERAL VIEW -- this one is working properly.
execute("SELECT
date_int,
gender,
part,
COUNT( DISTINCT member_id ),
SUM( partstats['seconds'] )
FROM (
SELECT
date_int,
visit_stats.member_id,
members_map.gender as gender,
parts
FROM visit_stats
JOIN members_map ON(members_map.member_id = visit_stats.member_id)
) visit_stats
LATERAL VIEW explode(parts) partsTable AS part, partstats
GROUP BY date_int, gender, part");

// Case 2: adding MAPJOIN, removing LATERAL VIEW -- still working.
execute("SELECT
date_int,
gender,
COUNT( DISTINCT member_id )
FROM (
SELECT
/*+ MAPJOIN(members_map) */
date_int,
visit_stats.member_id,
members_map.gender as gender,
parts
FROM visit_stats
JOIN members_map ON(members_map.member_id = visit_stats.member_id)
) visit_stats
GROUP BY date_int, gender");

// Case 3: adding MAPJOIN, keeping LATERAL VIEW -- no output (the bug
// this gist reproduces).
execute("SELECT
date_int,
gender,
part,
COUNT( DISTINCT member_id ),
SUM( partstats['seconds'] )
FROM (
SELECT
/*+ MAPJOIN(members_map) */
date_int,
visit_stats.member_id,
members_map.gender as gender,
parts
FROM visit_stats
JOIN members_map ON(members_map.member_id = visit_stats.member_id)
) visit_stats
LATERAL VIEW explode(parts) partsTable AS part, partstats
GROUP BY date_int, gender, part");
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment