Last active
September 12, 2017 09:45
-
-
Save f-ewald/b2a70b06537d532fe475 to your computer and use it in GitHub Desktop.
Ex06
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
INSERT OVERWRITE LOCAL DIRECTORY '${hiveconf:output_file}' | |
ROW FORMAT DELIMITED | |
FIELDS TERMINATED BY ',' | |
SELECT hashtag.object, COUNT(*) as amount | |
FROM | |
${hiveconf:table} t1 | |
JOIN | |
${hiveconf:table} t2 | |
ON | |
t1.subject = t2.subject | |
JOIN | |
${hiveconf:table} hashtag | |
ON | |
t2.subject = hashtag.subject | |
WHERE | |
t1.predicate == 'a' AND | |
t1.object == 'sib:Post' AND | |
t2.predicate == 'dc:created' AND | |
regexp_extract(t2.object, '"([0-9]{4}-[0-9]{2}-[0-9]{2})T', 1) BETWEEN '${hiveconf:start_date}' AND '${hiveconf:stop_date}' AND | |
hashtag.predicate == 'sib:hashtag' | |
GROUP BY | |
hashtag.object | |
ORDER BY | |
amount DESC | |
LIMIT 10; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- This is just for the file output | |
INSERT OVERWRITE LOCAL DIRECTORY '${hiveconf:output_file}' | |
ROW FORMAT DELIMITED | |
FIELDS TERMINATED BY ',' | |
-- First select all Posts, then join by the date created to filter between these dates. | |
-- Then join the post by its content (any content) and by all of its hashtags. | |
-- If either the content or the hashtag or both is equal to the keyword include in output | |
-- GROUP BY subject to output only the post id. | |
SELECT t1.subject | |
FROM | |
${hiveconf:table} t1 | |
JOIN | |
${hiveconf:table} t2 | |
ON | |
t1.subject = t2.subject | |
JOIN | |
${hiveconf:table} content | |
ON | |
t2.subject = content.subject | |
LEFT OUTER JOIN | |
(SELECT * FROM ${hiveconf:table} WHERE predicate == 'sib:hashtag') hashtag | |
ON | |
t2.subject = hashtag.subject | |
WHERE | |
t1.predicate == 'a' AND | |
t1.object == 'sib:Post' AND | |
t2.predicate == 'dc:created' AND | |
regexp_extract(t2.object, '"([0-9]{4}-[0-9]{2}-[0-9]{2})T', 1) BETWEEN '${hiveconf:start_date}' AND '${hiveconf:stop_date}' AND | |
content.predicate == 'sioc:content' AND | |
(hashtag.object LIKE '%${hiveconf:hashtag}' OR content.object LIKE '%${hiveconf:hashtag}%') | |
GROUP BY | |
t1.subject |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- This is just for the file output | |
INSERT OVERWRITE LOCAL DIRECTORY '${hiveconf:output_file}' | |
ROW FORMAT DELIMITED | |
FIELDS TERMINATED BY ',' | |
-- We go the other way round, we select all the users which are NOT IN the active user set | |
-- this set is determined by the posts which are created at this date range. | |
SELECT t1.subject FROM ${hiveconf:table} t1 WHERE object == 'sioc:user' AND t1.subject | |
NOT IN | |
(SELECT | |
t12.subject | |
FROM | |
${hiveconf:table} t11 | |
JOIN | |
${hiveconf:table} t12 | |
ON | |
t11.subject = t12.object | |
JOIN | |
${hiveconf:table} t_date | |
ON | |
t11.subject = t_date.subject | |
JOIN | |
${hiveconf:table} registration | |
ON | |
t12.subject = registration.subject | |
WHERE | |
t11.predicate == 'a' AND | |
t11.object == 'sib:Post' AND | |
t12.predicate == 'sioc:creator_of' AND | |
t_date.predicate == 'dc:created' AND | |
regexp_extract(t_date.object, '"([0-9]{4}-[0-9]{2}-[0-9]{2})T', 1) BETWEEN '${hiveconf:start_date}' AND '${hiveconf:stop_date}' AND | |
regexp_extract(registration.object, '"([0-9]{4}-[0-9]{2}-[0-9]{2})T', 1) < '${hiveconf:startDate}' | |
GROUP BY | |
t12.subject); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
INSERT OVERWRITE LOCAL DIRECTORY '${hiveconf:output_file}' | |
ROW FORMAT DELIMITED | |
FIELDS TERMINATED BY ',' | |
SELECT * FROM ( | |
SELECT | |
t_friend.subject, COUNT(*) as amount_in | |
FROM | |
${hiveconf:table} t_user | |
JOIN | |
${hiveconf:table} t_friend | |
ON | |
t_user.subject = t_friend.subject | |
WHERE | |
t_user.predicate == 'a' AND | |
t_user.object == 'sib:User' AND | |
t_friend.predicate == 'foaf:knows' AND | |
t_friend.object IN ( | |
SELECT | |
t_grp.object | |
FROM | |
${hiveconf:table} t_grp | |
WHERE | |
subject == '${hiveconf:group}' AND | |
predicate == 'sioc:has_member') | |
GROUP BY | |
t_friend.subject | |
HAVING | |
COUNT(*) >= 3 | |
) as res | |
WHERE | |
res.subject NOT IN ( | |
SELECT | |
object | |
FROM | |
${hiveconf:table} | |
WHERE | |
subject == '${hiveconf:group}' AND | |
predicate == 'sioc:has_member' | |
) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
INSERT OVERWRITE LOCAL DIRECTORY '${hiveconf:output_file}' | |
ROW FORMAT DELIMITED | |
FIELDS TERMINATED BY ',' | |
SELECT likes.object, likes.amount, comments.amount, (likes.amount/comments.amount) as ratio FROM (SELECT | |
likes.object, COUNT(*) as amount | |
FROM | |
${hiveconf:table} likes | |
WHERE | |
predicate == 'sib:like' | |
GROUP BY | |
likes.object) as likes | |
JOIN | |
(SELECT subject, COUNT(*) as amount FROM ${hiveconf:table} WHERE predicate == 'sioc:container_of' GROUP BY subject) as comments | |
ON | |
likes.object = comments.subject | |
ORDER BY | |
ratio ASC | |
LIMIT 50 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
REGISTER RDFStorage.jar ; | |
-- /home/cloudera/Downloads/input/sibdataset200.nt | |
indata = LOAD '$input_file' USING RDFStorage() AS (s:chararray,p:chararray,o:chararray) ; | |
likes = FILTER indata BY p == 'sib:like'; | |
likes_group = GROUP likes BY o; | |
likes_grouped_count = FOREACH likes_group GENERATE group AS object, COUNT(likes) AS amount; | |
comments = FILTER indata BY p == 'sioc:container_of'; | |
comments_grouped = GROUP comments BY s; | |
comments_grouped_count = FOREACH comments_grouped GENERATE group AS subject, COUNT(comments) AS amount; | |
combined = JOIN likes_grouped_count BY object, comments_grouped_count BY subject; | |
combined_ratio = FOREACH combined GENERATE likes_grouped_count::object AS post, (float)likes_grouped_count::amount/(float)comments_grouped_count::amount/1f AS ratio; | |
combined_ratio_sorted = ORDER combined_ratio BY ratio ASC; | |
outdata = LIMIT combined_ratio_sorted 50; | |
STORE outdata INTO '$output_file' USING PigStorage(',') ; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE EXTERNAL TABLE ${hiveconf:table} (subject STRING, predicate STRING, object STRING ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LOCATION '${hiveconf:tab_file}'; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
REGISTER RDFStorage.jar ; | |
indata = LOAD '$input_file' USING RDFStorage() AS (s,p,o) ; | |
STORE indata INTO '$output_file' USING PigStorage('\t') ; |
We can make this file beautiful and searchable if this error is corrected: It looks like row 2 should actually have 1 column, instead of 2 in line 1.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Note: Date chosen between '2010-10-14' AND '2012-12-31' | |
hashtag.object,amount | |
dbp:Battle,2484 | |
dbp:album,1964 | |
dbp:Adam,1500 | |
dbp:Levine,1500 | |
dbp:The,682 | |
dbp:War,616 | |
dbp:Chris,595 | |
dbp:Antonacci,463 | |
dbp:Biagio,463 | |
dbp:Kirkpatrick,433 |
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
t1.subject | |
sibpo:po7529 | |
sibpo:po7581 | |
sibpo:po7588 | |
sibpo:po7595 |
We can make this file beautiful and searchable if this error is corrected: It looks like row 2 should actually have 2 columns, instead of 1 in line 1.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
WARNING: Results updated, still need to COUNT(*) the resulsts. | |
Below just the raw results | |
t1.subject | |
========== | |
sibu:u139 | |
sibu:u109 | |
sibu:u189 | |
sibu:u105 | |
sibu:u178 | |
sibu:u42 | |
sibu:u128 | |
sibu:u184 | |
sibu:u196 | |
sibu:u92 | |
sibu:u70 | |
sibu:u108 | |
sibu:u174 |
We can make this file beautiful and searchable if this error is corrected: Illegal quoting in line 1.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Note: Used group "g1" from sibdataset400.nt | |
res.subject,res.amount_in | |
sibu:u0,6 | |
sibu:u1,3 | |
sibu:u10,3 | |
sibu:u100,7 | |
sibu:u101,7 | |
sibu:u102,5 | |
sibu:u103,3 | |
sibu:u104,7 | |
sibu:u105,5 | |
sibu:u106,4 | |
sibu:u107,5 | |
sibu:u108,7 | |
sibu:u110,4 | |
sibu:u111,7 | |
sibu:u112,3 | |
sibu:u114,3 | |
sibu:u115,10 | |
sibu:u117,11 | |
sibu:u118,3 | |
sibu:u120,3 | |
sibu:u122,7 | |
sibu:u123,9 | |
sibu:u127,4 | |
sibu:u13,4 | |
sibu:u131,6 | |
sibu:u133,4 | |
sibu:u134,3 | |
sibu:u135,9 | |
sibu:u139,4 | |
sibu:u140,7 | |
sibu:u146,3 | |
sibu:u149,3 | |
sibu:u152,4 | |
sibu:u153,3 | |
sibu:u157,5 | |
sibu:u158,5 | |
sibu:u16,3 | |
sibu:u162,4 | |
sibu:u165,3 | |
sibu:u168,6 | |
sibu:u169,5 | |
sibu:u170,3 | |
sibu:u172,4 | |
sibu:u174,6 | |
sibu:u176,3 | |
sibu:u179,3 | |
sibu:u181,4 | |
sibu:u186,6 | |
sibu:u191,3 | |
sibu:u194,5 | |
sibu:u20,3 | |
sibu:u204,3 | |
sibu:u205,3 | |
sibu:u207,4 | |
sibu:u209,5 | |
sibu:u21,4 | |
sibu:u210,5 | |
sibu:u211,3 | |
sibu:u212,5 | |
sibu:u213,4 | |
sibu:u214,4 | |
sibu:u219,3 | |
sibu:u22,3 | |
sibu:u225,3 | |
sibu:u226,3 | |
sibu:u227,7 | |
sibu:u231,3 | |
sibu:u236,3 | |
sibu:u237,3 | |
sibu:u238,3 | |
sibu:u24,4 | |
sibu:u241,4 | |
sibu:u244,3 | |
sibu:u251,4 | |
sibu:u252,5 | |
sibu:u254,3 | |
sibu:u265,5 | |
sibu:u271,5 | |
sibu:u273,3 | |
sibu:u277,4 | |
sibu:u28,4 | |
sibu:u280,4 | |
sibu:u287,4 | |
sibu:u29,7 | |
sibu:u291,4 | |
sibu:u295,3 | |
sibu:u296,3 | |
sibu:u299,3 | |
sibu:u30,3 | |
sibu:u305,5 | |
sibu:u307,3 | |
sibu:u310,5 | |
sibu:u318,3 | |
sibu:u320,3 | |
sibu:u324,5 | |
sibu:u325,6 | |
sibu:u326,5 | |
sibu:u327,4 | |
sibu:u328,4 | |
sibu:u334,3 | |
sibu:u336,3 | |
sibu:u338,3 | |
sibu:u34,8 | |
sibu:u340,3 | |
sibu:u341,3 | |
sibu:u345,4 | |
sibu:u350,5 | |
sibu:u351,3 | |
sibu:u352,7 | |
sibu:u353,4 | |
sibu:u357,3 | |
sibu:u36,3 | |
sibu:u360,3 | |
sibu:u362,3 | |
sibu:u364,3 | |
sibu:u375,3 | |
sibu:u378,4 | |
sibu:u387,6 | |
sibu:u39,3 | |
sibu:u392,3 | |
sibu:u394,3 | |
sibu:u398,3 | |
sibu:u40,7 | |
sibu:u44,3 | |
sibu:u48,3 | |
sibu:u5,4 | |
sibu:u51,3 | |
sibu:u54,3 | |
sibu:u58,4 | |
sibu:u59,3 | |
sibu:u6,5 | |
sibu:u60,3 | |
sibu:u67,4 | |
sibu:u69,3 | |
sibu:u70,4 | |
sibu:u71,6 | |
sibu:u72,7 | |
sibu:u73,6 | |
sibu:u74,6 | |
sibu:u76,4 | |
sibu:u79,7 | |
sibu:u81,6 | |
sibu:u82,11 | |
sibu:u84,4 | |
sibu:u85,5 | |
sibu:u88,30 | |
sibu:u89,7 | |
sibu:u91,10 | |
sibu:u92,4 | |
sibu:u93,4 | |
sibu:u94,4 | |
sibu:u96,5 | |
sibu:u98,9 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
likes.object | likes.amount | comments.amount | ratio | |
---|---|---|---|---|
sibpo:po10978 | 1 | 19 | 0.0526315789474 | |
sibpo:po7126 | 1 | 19 | 0.0526315789474 | |
sibpo:po15828 | 1 | 19 | 0.0526315789474 | |
sibpo:po20321 | 1 | 19 | 0.0526315789474 | |
sibpo:po28088 | 1 | 19 | 0.0526315789474 | |
sibpo:po9725 | 1 | 19 | 0.0526315789474 | |
sibpo:po9717 | 1 | 19 | 0.0526315789474 | |
sibpo:po22738 | 1 | 19 | 0.0526315789474 | |
sibpo:po4385 | 1 | 19 | 0.0526315789474 | |
sibpo:po13308 | 1 | 19 | 0.0526315789474 | |
sibpo:po12674 | 1 | 18 | 0.0555555555556 | |
sibpo:po15728 | 1 | 18 | 0.0555555555556 | |
sibpo:po22771 | 1 | 18 | 0.0555555555556 | |
sibpo:po9562 | 1 | 18 | 0.0555555555556 | |
sibpo:po6463 | 1 | 18 | 0.0555555555556 | |
sibpo:po27731 | 1 | 18 | 0.0555555555556 | |
sibpo:po24746 | 1 | 18 | 0.0555555555556 | |
sibpo:po22710 | 1 | 18 | 0.0555555555556 | |
sibpo:po11611 | 1 | 18 | 0.0555555555556 | |
sibpo:po15973 | 1 | 18 | 0.0555555555556 | |
sibpo:po7489 | 1 | 18 | 0.0555555555556 | |
sibpo:po7482 | 1 | 18 | 0.0555555555556 | |
sibpo:po8815 | 1 | 18 | 0.0555555555556 | |
sibpo:po8380 | 1 | 17 | 0.0588235294118 | |
sibpo:po19688 | 1 | 17 | 0.0588235294118 | |
sibpo:po28094 | 1 | 17 | 0.0588235294118 | |
sibpo:po8986 | 1 | 17 | 0.0588235294118 | |
sibpo:po11409 | 1 | 17 | 0.0588235294118 | |
sibpo:po24323 | 1 | 17 | 0.0588235294118 | |
sibpo:po30107 | 1 | 17 | 0.0588235294118 | |
sibpo:po26033 | 1 | 17 | 0.0588235294118 | |
sibpo:po8313 | 1 | 17 | 0.0588235294118 | |
sibpo:po3769 | 1 | 17 | 0.0588235294118 | |
sibpo:po15757 | 1 | 17 | 0.0588235294118 | |
sibpo:po21688 | 1 | 17 | 0.0588235294118 | |
sibpo:po5376 | 1 | 17 | 0.0588235294118 | |
sibpo:po23437 | 1 | 17 | 0.0588235294118 | |
sibpo:po6001 | 1 | 17 | 0.0588235294118 | |
sibpo:po28991 | 1 | 17 | 0.0588235294118 | |
sibpo:po18156 | 1 | 17 | 0.0588235294118 | |
sibpo:po8749 | 1 | 17 | 0.0588235294118 | |
sibpo:po6246 | 1 | 17 | 0.0588235294118 | |
sibpo:po21917 | 1 | 17 | 0.0588235294118 | |
sibpo:po20952 | 1 | 17 | 0.0588235294118 | |
sibpo:po21876 | 1 | 17 | 0.0588235294118 | |
sibpo:po16522 | 1 | 17 | 0.0588235294118 | |
sibpo:po23509 | 1 | 17 | 0.0588235294118 | |
sibpo:po14344 | 1 | 17 | 0.0588235294118 | |
sibpo:po9447 | 1 | 16 | 0.0625 | |
sibpo:po7747 | 1 | 16 | 0.0625 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh | |
# Full path to input file or dir | |
INPUT_FILE='/home/cloudera/Downloads/input/sibdataset200.nt' | |
INPUT_FILE400='/home/cloudera/Downloads/input/sibdataset400.nt' | |
# Output path for tab dir (from which we move to hdfs) | |
TAB_DIR='/home/cloudera/Downloads/ex6/' | |
#Name of the tab file | |
TAB_FILE='sibdataset_tab' | |
TAB_FILE400='sibdataset_tab400' | |
# Directory on HDFS where to write and read the tab file from | |
HDFS_DIR='/user/cloudera/' | |
# Name of the hive table | |
TABLE='ewaldf_sibdataset' | |
TABLE400='ewaldf_sibdataset400' | |
# Prepare the tabbed output (only run once) | |
# If you want to repeat the run, then first: "DROP TABLE $TABLE" AND DELETE output files :-) | |
pig -x local -param input_file=$INPUT_FILE -param output_file=$TAB_DIR$TAB_FILE prepare.pig | |
hadoop fs -put $TAB_DIR$TAB_FILE/ $HDFS_DIR | |
hive -hiveconf tab_file=$HDFS_DIR/$TAB_FILE -hiveconf table=$TABLE -f prepare.hive | |
# Prepare the 400er output | |
pig -x local -param input_file=$INPUT_FILE400 -param output_file=$TAB_DIR$TAB_FILE400 prepare.pig | |
hadoop fs -put $TAB_DIR$TAB_FILE400/ $HDFS_DIR | |
hive -hiveconf tab_file=$HDFS_DIR/$TAB_FILE400 -hiveconf table=$TABLE400 -f prepare.hive | |
# Task 6_1 | |
hive -hiveconf table=$TABLE -hiveconf output_file=$TAB_DIR/6_1 -hiveconf start_date=2010-12-01 -hiveconf stop_date=2010-12-31 -f 6_1.hive | |
# Task 6_2 | |
hive -hiveconf table=$TABLE -hiveconf output_file=$TAB_DIR/6_2 -hiveconf start_date=2010-12-01 -hiveconf stop_date=2010-12-31 -hiveconf hashtag=Tunisia -f 6_2.hive | |
# Task 6_3 | |
hive -hiveconf table=$TABLE -hiveconf output_file=$TAB_DIR/6_3 -hiveconf start_date=2010-12-01 -hiveconf stop_date=2010-12-31 -f 6_3.hive | |
# Task 6_4 | |
hive -hiveconf table=$TABLE400 -hiveconf output_file=$TAB_DIR/6_4 -hiveconf group=sibg:g1 -f 6_4.hive | |
# Task 6_5_1 | |
hive -hiveconf table=$TABLE -hiveconf output_file=$TAB_DIR/6_5_1 -f 6_5_1.hive | |
# Task 6_5_2 | |
pig -x local -param input_file=$INPUT_FILE -param output_file=$TAB_DIR/6_5_2 6_5_2.pig |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment