Skip to content

Instantly share code, notes, and snippets.

@kerinin
Last active August 29, 2015 14:02
Show Gist options
  • Save kerinin/98e0e620c490e1dcd976 to your computer and use it in GitHub Desktop.
Save kerinin/98e0e620c490e1dcd976 to your computer and use it in GitHub Desktop.
-- This will need to be built and put on the machine you're running Hive from
add jar json-serde-1.1.9.3-SNAPSHOT-jar-with-dependencies.jar;
-- Creates the table based on the JSON schema
CREATE TABLE json_nested_test (
metadata struct<uuid:string,md5_email:string,received_at:string,message_id:string,from_domain:string,message_signature:string,provider:string>,
headers map<string,struct<length:string,match_counts:map<string,string>,text_hash_counts:array<array<string>>,address_hashes:array<string>>>,
parts array<struct<mime_type:string,length:string,stripped_length:string,match_counts:map<string,string>,urls:array<struct<domain:string,sha:string>>,images:array<struct<domain:string,sha:string,pixel_count:string>>,text_hash_counts:array<array<string>>>>,
attachments array<struct<mime_type:string,bytesize:string,filename_sha:string,filetype:string>>
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
STORED AS TEXTFILE;
-- Eliminates exceptions if the JSON has issues (duplicate keys, etc)
ALTER TABLE json_nested_test SET SERDEPROPERTIES ( "ignore.malformed.json" = "true");
-- Loads one of the files
LOAD DATA LOCAL INPATH '/na/mapr/data/extracted_message_features/v1/2014/21/2014-06-01/p_39_1648813793180' OVERWRITE INTO TABLE json_nested_test ;
-- Select the match counts where the first part is 'text/html'
SELECT parts[0].match_counts
FROM json_nested_test
WHERE parts[0].mime_type == 'text/html'
LIMIT 10;
-- Select [uuid, hash, count] triplets for all 'text/plain' parts
SELECT metadata.uuid, part_hash[0], part_hash[1]
FROM
json_nested_test
LATERAL VIEW explode(parts.text_hash_counts) LineHashes as line_hash
LATERAL VIEW explode(parts.mime_type) MimeTypes as mime_type
LATERAL VIEW explode(line_hash) PartHashes as part_hash
WHERE mime_type == 'text/plain'
LIMIT 10;
DROP TABLE json_nested_test;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment