Created
July 13, 2016 10:00
-
-
Save rmoff/a38e853d3a7dcb48a9df99ce1e3505ff to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Instantiate the BDD Shell context" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"execfile('ipython/00-bdd-shell-init.py')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Import JSON using SparkSQL\n", | |
"\n", | |
"(Data source: Twitter API)\n", | |
"\n", | |
"Note that SparkSQL expects the JSON to be a single record per line. If you've got a single JSON object then you'll need to modify this code accordingly. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Read a folder of files\n", | |
"df = sqlContext.read.json('hdfs:///user/oracle/incoming/twitter/2016/07/12/')\n", | |
"# Read a specific file\n", | |
"#df = sqlContext.read.json('hdfs:///user/oracle/incoming/twitter/2016/07/12/FlumeData.1468339844123')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Register temporary table - necessary for subsequent SQL queries" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"df.registerTempTable(\"twitter\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Record count" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 57, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+----+\n", | |
"| _c0|\n", | |
"+----+\n", | |
"|3011|\n", | |
"+----+\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"result_df = sqlContext.sql(\"select count(*) from twitter\")\n", | |
"result_df.show()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Preview schema - nesting (`struct`) and arrays (`array`) ahoy!" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"root\n", | |
" |-- @timestamp: string (nullable = true)\n", | |
" |-- @version: string (nullable = true)\n", | |
" |-- _corrupt_record: string (nullable = true)\n", | |
" |-- contributors: string (nullable = true)\n", | |
" |-- coordinates: struct (nullable = true)\n", | |
" | |-- coordinates: array (nullable = true)\n", | |
" | | |-- element: double (containsNull = true)\n", | |
" | |-- type: string (nullable = true)\n", | |
" |-- created_at: string (nullable = true)\n", | |
" |-- entities: struct (nullable = true)\n", | |
" | |-- hashtags: array (nullable = true)\n", | |
" | | |-- element: struct (containsNull = true)\n", | |
" | | | |-- indices: array (nullable = true)\n", | |
" | | | | |-- element: long (containsNull = true)\n", | |
" | | | |-- text: string (nullable = true)\n", | |
" | |-- media: array (nullable = true)\n", | |
" | | |-- element: struct (containsNull = true)\n", | |
" | | | |-- display_url: string (nullable = true)\n", | |
" | | | |-- expanded_url: string (nullable = true)\n", | |
" | | | |-- id: long (nullable = true)\n", | |
" | | | |-- id_str: string (nullable = true)\n", | |
" | | | |-- indices: array (nullable = true)\n", | |
" | | | | |-- element: long (containsNull = true)\n", | |
" | | | |-- media_url: string (nullable = true)\n", | |
" | | | |-- media_url_https: string (nullable = true)\n", | |
" | | | |-- sizes: struct (nullable = true)\n", | |
" | | | | |-- large: struct (nullable = true)\n", | |
" | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | |-- w: long (nullable = true)\n", | |
" | | | | |-- medium: struct (nullable = true)\n", | |
" | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | |-- w: long (nullable = true)\n", | |
" | | | | |-- small: struct (nullable = true)\n", | |
" | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | |-- w: long (nullable = true)\n", | |
" | | | | |-- thumb: struct (nullable = true)\n", | |
" | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | |-- w: long (nullable = true)\n", | |
" | | | |-- source_status_id: long (nullable = true)\n", | |
" | | | |-- source_status_id_str: string (nullable = true)\n", | |
" | | | |-- source_user_id: long (nullable = true)\n", | |
" | | | |-- source_user_id_str: string (nullable = true)\n", | |
" | | | |-- type: string (nullable = true)\n", | |
" | | | |-- url: string (nullable = true)\n", | |
" | |-- symbols: array (nullable = true)\n", | |
" | | |-- element: struct (containsNull = true)\n", | |
" | | | |-- indices: array (nullable = true)\n", | |
" | | | | |-- element: long (containsNull = true)\n", | |
" | | | |-- text: string (nullable = true)\n", | |
" | |-- urls: array (nullable = true)\n", | |
" | | |-- element: struct (containsNull = true)\n", | |
" | | | |-- display_url: string (nullable = true)\n", | |
" | | | |-- expanded_url: string (nullable = true)\n", | |
" | | | |-- indices: array (nullable = true)\n", | |
" | | | | |-- element: long (containsNull = true)\n", | |
" | | | |-- url: string (nullable = true)\n", | |
" | |-- user_mentions: array (nullable = true)\n", | |
" | | |-- element: struct (containsNull = true)\n", | |
" | | | |-- id: long (nullable = true)\n", | |
" | | | |-- id_str: string (nullable = true)\n", | |
" | | | |-- indices: array (nullable = true)\n", | |
" | | | | |-- element: long (containsNull = true)\n", | |
" | | | |-- name: string (nullable = true)\n", | |
" | | | |-- screen_name: string (nullable = true)\n", | |
" |-- extended_entities: struct (nullable = true)\n", | |
" | |-- media: array (nullable = true)\n", | |
" | | |-- element: struct (containsNull = true)\n", | |
" | | | |-- display_url: string (nullable = true)\n", | |
" | | | |-- expanded_url: string (nullable = true)\n", | |
" | | | |-- id: long (nullable = true)\n", | |
" | | | |-- id_str: string (nullable = true)\n", | |
" | | | |-- indices: array (nullable = true)\n", | |
" | | | | |-- element: long (containsNull = true)\n", | |
" | | | |-- media_url: string (nullable = true)\n", | |
" | | | |-- media_url_https: string (nullable = true)\n", | |
" | | | |-- sizes: struct (nullable = true)\n", | |
" | | | | |-- large: struct (nullable = true)\n", | |
" | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | |-- w: long (nullable = true)\n", | |
" | | | | |-- medium: struct (nullable = true)\n", | |
" | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | |-- w: long (nullable = true)\n", | |
" | | | | |-- small: struct (nullable = true)\n", | |
" | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | |-- w: long (nullable = true)\n", | |
" | | | | |-- thumb: struct (nullable = true)\n", | |
" | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | |-- w: long (nullable = true)\n", | |
" | | | |-- source_status_id: long (nullable = true)\n", | |
" | | | |-- source_status_id_str: string (nullable = true)\n", | |
" | | | |-- source_user_id: long (nullable = true)\n", | |
" | | | |-- source_user_id_str: string (nullable = true)\n", | |
" | | | |-- type: string (nullable = true)\n", | |
" | | | |-- url: string (nullable = true)\n", | |
" | | | |-- video_info: struct (nullable = true)\n", | |
" | | | | |-- aspect_ratio: array (nullable = true)\n", | |
" | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | |-- duration_millis: long (nullable = true)\n", | |
" | | | | |-- variants: array (nullable = true)\n", | |
" | | | | | |-- element: struct (containsNull = true)\n", | |
" | | | | | | |-- bitrate: long (nullable = true)\n", | |
" | | | | | | |-- content_type: string (nullable = true)\n", | |
" | | | | | | |-- url: string (nullable = true)\n", | |
" |-- favorite_count: long (nullable = true)\n", | |
" |-- favorited: boolean (nullable = true)\n", | |
" |-- filter_level: string (nullable = true)\n", | |
" |-- geo: struct (nullable = true)\n", | |
" | |-- coordinates: array (nullable = true)\n", | |
" | | |-- element: double (containsNull = true)\n", | |
" | |-- type: string (nullable = true)\n", | |
" |-- id: long (nullable = true)\n", | |
" |-- id_str: string (nullable = true)\n", | |
" |-- in_reply_to_screen_name: string (nullable = true)\n", | |
" |-- in_reply_to_status_id: long (nullable = true)\n", | |
" |-- in_reply_to_status_id_str: string (nullable = true)\n", | |
" |-- in_reply_to_user_id: long (nullable = true)\n", | |
" |-- in_reply_to_user_id_str: string (nullable = true)\n", | |
" |-- is_quote_status: boolean (nullable = true)\n", | |
" |-- lang: string (nullable = true)\n", | |
" |-- place: struct (nullable = true)\n", | |
" | |-- bounding_box: struct (nullable = true)\n", | |
" | | |-- coordinates: array (nullable = true)\n", | |
" | | | |-- element: array (containsNull = true)\n", | |
" | | | | |-- element: array (containsNull = true)\n", | |
" | | | | | |-- element: double (containsNull = true)\n", | |
" | | |-- type: string (nullable = true)\n", | |
" | |-- country: string (nullable = true)\n", | |
" | |-- country_code: string (nullable = true)\n", | |
" | |-- full_name: string (nullable = true)\n", | |
" | |-- id: string (nullable = true)\n", | |
" | |-- name: string (nullable = true)\n", | |
" | |-- place_type: string (nullable = true)\n", | |
" | |-- url: string (nullable = true)\n", | |
" |-- possibly_sensitive: boolean (nullable = true)\n", | |
" |-- quoted_status: struct (nullable = true)\n", | |
" | |-- contributors: string (nullable = true)\n", | |
" | |-- coordinates: string (nullable = true)\n", | |
" | |-- created_at: string (nullable = true)\n", | |
" | |-- entities: struct (nullable = true)\n", | |
" | | |-- hashtags: array (nullable = true)\n", | |
" | | | |-- element: struct (containsNull = true)\n", | |
" | | | | |-- indices: array (nullable = true)\n", | |
" | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | |-- text: string (nullable = true)\n", | |
" | | |-- media: array (nullable = true)\n", | |
" | | | |-- element: struct (containsNull = true)\n", | |
" | | | | |-- display_url: string (nullable = true)\n", | |
" | | | | |-- expanded_url: string (nullable = true)\n", | |
" | | | | |-- id: long (nullable = true)\n", | |
" | | | | |-- id_str: string (nullable = true)\n", | |
" | | | | |-- indices: array (nullable = true)\n", | |
" | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | |-- media_url: string (nullable = true)\n", | |
" | | | | |-- media_url_https: string (nullable = true)\n", | |
" | | | | |-- sizes: struct (nullable = true)\n", | |
" | | | | | |-- large: struct (nullable = true)\n", | |
" | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | |-- medium: struct (nullable = true)\n", | |
" | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | |-- small: struct (nullable = true)\n", | |
" | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | |-- thumb: struct (nullable = true)\n", | |
" | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | |-- type: string (nullable = true)\n", | |
" | | | | |-- url: string (nullable = true)\n", | |
" | | |-- symbols: array (nullable = true)\n", | |
" | | | |-- element: struct (containsNull = true)\n", | |
" | | | | |-- indices: array (nullable = true)\n", | |
" | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | |-- text: string (nullable = true)\n", | |
" | | |-- urls: array (nullable = true)\n", | |
" | | | |-- element: struct (containsNull = true)\n", | |
" | | | | |-- display_url: string (nullable = true)\n", | |
" | | | | |-- expanded_url: string (nullable = true)\n", | |
" | | | | |-- indices: array (nullable = true)\n", | |
" | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | |-- url: string (nullable = true)\n", | |
" | | |-- user_mentions: array (nullable = true)\n", | |
" | | | |-- element: struct (containsNull = true)\n", | |
" | | | | |-- id: long (nullable = true)\n", | |
" | | | | |-- id_str: string (nullable = true)\n", | |
" | | | | |-- indices: array (nullable = true)\n", | |
" | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | |-- name: string (nullable = true)\n", | |
" | | | | |-- screen_name: string (nullable = true)\n", | |
" | |-- extended_entities: struct (nullable = true)\n", | |
" | | |-- media: array (nullable = true)\n", | |
" | | | |-- element: struct (containsNull = true)\n", | |
" | | | | |-- display_url: string (nullable = true)\n", | |
" | | | | |-- expanded_url: string (nullable = true)\n", | |
" | | | | |-- id: long (nullable = true)\n", | |
" | | | | |-- id_str: string (nullable = true)\n", | |
" | | | | |-- indices: array (nullable = true)\n", | |
" | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | |-- media_url: string (nullable = true)\n", | |
" | | | | |-- media_url_https: string (nullable = true)\n", | |
" | | | | |-- sizes: struct (nullable = true)\n", | |
" | | | | | |-- large: struct (nullable = true)\n", | |
" | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | |-- medium: struct (nullable = true)\n", | |
" | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | |-- small: struct (nullable = true)\n", | |
" | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | |-- thumb: struct (nullable = true)\n", | |
" | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | |-- type: string (nullable = true)\n", | |
" | | | | |-- url: string (nullable = true)\n", | |
" | | | | |-- video_info: struct (nullable = true)\n", | |
" | | | | | |-- aspect_ratio: array (nullable = true)\n", | |
" | | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | | |-- duration_millis: long (nullable = true)\n", | |
" | | | | | |-- variants: array (nullable = true)\n", | |
" | | | | | | |-- element: struct (containsNull = true)\n", | |
" | | | | | | | |-- bitrate: long (nullable = true)\n", | |
" | | | | | | | |-- content_type: string (nullable = true)\n", | |
" | | | | | | | |-- url: string (nullable = true)\n", | |
" | |-- favorite_count: long (nullable = true)\n", | |
" | |-- favorited: boolean (nullable = true)\n", | |
" | |-- filter_level: string (nullable = true)\n", | |
" | |-- geo: string (nullable = true)\n", | |
" | |-- id: long (nullable = true)\n", | |
" | |-- id_str: string (nullable = true)\n", | |
" | |-- in_reply_to_screen_name: string (nullable = true)\n", | |
" | |-- in_reply_to_status_id: long (nullable = true)\n", | |
" | |-- in_reply_to_status_id_str: string (nullable = true)\n", | |
" | |-- in_reply_to_user_id: long (nullable = true)\n", | |
" | |-- in_reply_to_user_id_str: string (nullable = true)\n", | |
" | |-- is_quote_status: boolean (nullable = true)\n", | |
" | |-- lang: string (nullable = true)\n", | |
" | |-- place: struct (nullable = true)\n", | |
" | | |-- bounding_box: struct (nullable = true)\n", | |
" | | | |-- coordinates: array (nullable = true)\n", | |
" | | | | |-- element: array (containsNull = true)\n", | |
" | | | | | |-- element: array (containsNull = true)\n", | |
" | | | | | | |-- element: double (containsNull = true)\n", | |
" | | | |-- type: string (nullable = true)\n", | |
" | | |-- country: string (nullable = true)\n", | |
" | | |-- country_code: string (nullable = true)\n", | |
" | | |-- full_name: string (nullable = true)\n", | |
" | | |-- id: string (nullable = true)\n", | |
" | | |-- name: string (nullable = true)\n", | |
" | | |-- place_type: string (nullable = true)\n", | |
" | | |-- url: string (nullable = true)\n", | |
" | |-- possibly_sensitive: boolean (nullable = true)\n", | |
" | |-- quoted_status_id: long (nullable = true)\n", | |
" | |-- quoted_status_id_str: string (nullable = true)\n", | |
" | |-- retweet_count: long (nullable = true)\n", | |
" | |-- retweeted: boolean (nullable = true)\n", | |
" | |-- source: string (nullable = true)\n", | |
" | |-- text: string (nullable = true)\n", | |
" | |-- truncated: boolean (nullable = true)\n", | |
" | |-- user: struct (nullable = true)\n", | |
" | | |-- contributors_enabled: boolean (nullable = true)\n", | |
" | | |-- created_at: string (nullable = true)\n", | |
" | | |-- default_profile: boolean (nullable = true)\n", | |
" | | |-- default_profile_image: boolean (nullable = true)\n", | |
" | | |-- description: string (nullable = true)\n", | |
" | | |-- favourites_count: long (nullable = true)\n", | |
" | | |-- follow_request_sent: string (nullable = true)\n", | |
" | | |-- followers_count: long (nullable = true)\n", | |
" | | |-- following: string (nullable = true)\n", | |
" | | |-- friends_count: long (nullable = true)\n", | |
" | | |-- geo_enabled: boolean (nullable = true)\n", | |
" | | |-- id: long (nullable = true)\n", | |
" | | |-- id_str: string (nullable = true)\n", | |
" | | |-- is_translator: boolean (nullable = true)\n", | |
" | | |-- lang: string (nullable = true)\n", | |
" | | |-- listed_count: long (nullable = true)\n", | |
" | | |-- location: string (nullable = true)\n", | |
" | | |-- name: string (nullable = true)\n", | |
" | | |-- notifications: string (nullable = true)\n", | |
" | | |-- profile_background_color: string (nullable = true)\n", | |
" | | |-- profile_background_image_url: string (nullable = true)\n", | |
" | | |-- profile_background_image_url_https: string (nullable = true)\n", | |
" | | |-- profile_background_tile: boolean (nullable = true)\n", | |
" | | |-- profile_banner_url: string (nullable = true)\n", | |
" | | |-- profile_image_url: string (nullable = true)\n", | |
" | | |-- profile_image_url_https: string (nullable = true)\n", | |
" | | |-- profile_link_color: string (nullable = true)\n", | |
" | | |-- profile_sidebar_border_color: string (nullable = true)\n", | |
" | | |-- profile_sidebar_fill_color: string (nullable = true)\n", | |
" | | |-- profile_text_color: string (nullable = true)\n", | |
" | | |-- profile_use_background_image: boolean (nullable = true)\n", | |
" | | |-- protected: boolean (nullable = true)\n", | |
" | | |-- screen_name: string (nullable = true)\n", | |
" | | |-- statuses_count: long (nullable = true)\n", | |
" | | |-- time_zone: string (nullable = true)\n", | |
" | | |-- url: string (nullable = true)\n", | |
" | | |-- utc_offset: long (nullable = true)\n", | |
" | | |-- verified: boolean (nullable = true)\n", | |
" |-- quoted_status_id: long (nullable = true)\n", | |
" |-- quoted_status_id_str: string (nullable = true)\n", | |
" |-- retweet_count: long (nullable = true)\n", | |
" |-- retweeted: boolean (nullable = true)\n", | |
" |-- retweeted_status: struct (nullable = true)\n", | |
" | |-- contributors: string (nullable = true)\n", | |
" | |-- coordinates: struct (nullable = true)\n", | |
" | | |-- coordinates: array (nullable = true)\n", | |
" | | | |-- element: double (containsNull = true)\n", | |
" | | |-- type: string (nullable = true)\n", | |
" | |-- created_at: string (nullable = true)\n", | |
" | |-- entities: struct (nullable = true)\n", | |
" | | |-- hashtags: array (nullable = true)\n", | |
" | | | |-- element: struct (containsNull = true)\n", | |
" | | | | |-- indices: array (nullable = true)\n", | |
" | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | |-- text: string (nullable = true)\n", | |
" | | |-- media: array (nullable = true)\n", | |
" | | | |-- element: struct (containsNull = true)\n", | |
" | | | | |-- display_url: string (nullable = true)\n", | |
" | | | | |-- expanded_url: string (nullable = true)\n", | |
" | | | | |-- id: long (nullable = true)\n", | |
" | | | | |-- id_str: string (nullable = true)\n", | |
" | | | | |-- indices: array (nullable = true)\n", | |
" | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | |-- media_url: string (nullable = true)\n", | |
" | | | | |-- media_url_https: string (nullable = true)\n", | |
" | | | | |-- sizes: struct (nullable = true)\n", | |
" | | | | | |-- large: struct (nullable = true)\n", | |
" | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | |-- medium: struct (nullable = true)\n", | |
" | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | |-- small: struct (nullable = true)\n", | |
" | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | |-- thumb: struct (nullable = true)\n", | |
" | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | |-- source_status_id: long (nullable = true)\n", | |
" | | | | |-- source_status_id_str: string (nullable = true)\n", | |
" | | | | |-- source_user_id: long (nullable = true)\n", | |
" | | | | |-- source_user_id_str: string (nullable = true)\n", | |
" | | | | |-- type: string (nullable = true)\n", | |
" | | | | |-- url: string (nullable = true)\n", | |
" | | |-- symbols: array (nullable = true)\n", | |
" | | | |-- element: struct (containsNull = true)\n", | |
" | | | | |-- indices: array (nullable = true)\n", | |
" | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | |-- text: string (nullable = true)\n", | |
" | | |-- urls: array (nullable = true)\n", | |
" | | | |-- element: struct (containsNull = true)\n", | |
" | | | | |-- display_url: string (nullable = true)\n", | |
" | | | | |-- expanded_url: string (nullable = true)\n", | |
" | | | | |-- indices: array (nullable = true)\n", | |
" | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | |-- url: string (nullable = true)\n", | |
" | | |-- user_mentions: array (nullable = true)\n", | |
" | | | |-- element: struct (containsNull = true)\n", | |
" | | | | |-- id: long (nullable = true)\n", | |
" | | | | |-- id_str: string (nullable = true)\n", | |
" | | | | |-- indices: array (nullable = true)\n", | |
" | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | |-- name: string (nullable = true)\n", | |
" | | | | |-- screen_name: string (nullable = true)\n", | |
" | |-- extended_entities: struct (nullable = true)\n", | |
" | | |-- media: array (nullable = true)\n", | |
" | | | |-- element: struct (containsNull = true)\n", | |
" | | | | |-- display_url: string (nullable = true)\n", | |
" | | | | |-- expanded_url: string (nullable = true)\n", | |
" | | | | |-- id: long (nullable = true)\n", | |
" | | | | |-- id_str: string (nullable = true)\n", | |
" | | | | |-- indices: array (nullable = true)\n", | |
" | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | |-- media_url: string (nullable = true)\n", | |
" | | | | |-- media_url_https: string (nullable = true)\n", | |
" | | | | |-- sizes: struct (nullable = true)\n", | |
" | | | | | |-- large: struct (nullable = true)\n", | |
" | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | |-- medium: struct (nullable = true)\n", | |
" | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | |-- small: struct (nullable = true)\n", | |
" | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | |-- thumb: struct (nullable = true)\n", | |
" | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | |-- source_status_id: long (nullable = true)\n", | |
" | | | | |-- source_status_id_str: string (nullable = true)\n", | |
" | | | | |-- source_user_id: long (nullable = true)\n", | |
" | | | | |-- source_user_id_str: string (nullable = true)\n", | |
" | | | | |-- type: string (nullable = true)\n", | |
" | | | | |-- url: string (nullable = true)\n", | |
" | | | | |-- video_info: struct (nullable = true)\n", | |
" | | | | | |-- aspect_ratio: array (nullable = true)\n", | |
" | | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | | |-- duration_millis: long (nullable = true)\n", | |
" | | | | | |-- variants: array (nullable = true)\n", | |
" | | | | | | |-- element: struct (containsNull = true)\n", | |
" | | | | | | | |-- bitrate: long (nullable = true)\n", | |
" | | | | | | | |-- content_type: string (nullable = true)\n", | |
" | | | | | | | |-- url: string (nullable = true)\n", | |
" | |-- favorite_count: long (nullable = true)\n", | |
" | |-- favorited: boolean (nullable = true)\n", | |
" | |-- filter_level: string (nullable = true)\n", | |
" | |-- geo: struct (nullable = true)\n", | |
" | | |-- coordinates: array (nullable = true)\n", | |
" | | | |-- element: double (containsNull = true)\n", | |
" | | |-- type: string (nullable = true)\n", | |
" | |-- id: long (nullable = true)\n", | |
" | |-- id_str: string (nullable = true)\n", | |
" | |-- in_reply_to_screen_name: string (nullable = true)\n", | |
" | |-- in_reply_to_status_id: long (nullable = true)\n", | |
" | |-- in_reply_to_status_id_str: string (nullable = true)\n", | |
" | |-- in_reply_to_user_id: long (nullable = true)\n", | |
" | |-- in_reply_to_user_id_str: string (nullable = true)\n", | |
" | |-- is_quote_status: boolean (nullable = true)\n", | |
" | |-- lang: string (nullable = true)\n", | |
" | |-- place: struct (nullable = true)\n", | |
" | | |-- bounding_box: struct (nullable = true)\n", | |
" | | | |-- coordinates: array (nullable = true)\n", | |
" | | | | |-- element: array (containsNull = true)\n", | |
" | | | | | |-- element: array (containsNull = true)\n", | |
" | | | | | | |-- element: double (containsNull = true)\n", | |
" | | | |-- type: string (nullable = true)\n", | |
" | | |-- country: string (nullable = true)\n", | |
" | | |-- country_code: string (nullable = true)\n", | |
" | | |-- full_name: string (nullable = true)\n", | |
" | | |-- id: string (nullable = true)\n", | |
" | | |-- name: string (nullable = true)\n", | |
" | | |-- place_type: string (nullable = true)\n", | |
" | | |-- url: string (nullable = true)\n", | |
" | |-- possibly_sensitive: boolean (nullable = true)\n", | |
" | |-- quoted_status: struct (nullable = true)\n", | |
" | | |-- contributors: string (nullable = true)\n", | |
" | | |-- coordinates: string (nullable = true)\n", | |
" | | |-- created_at: string (nullable = true)\n", | |
" | | |-- entities: struct (nullable = true)\n", | |
" | | | |-- hashtags: array (nullable = true)\n", | |
" | | | | |-- element: struct (containsNull = true)\n", | |
" | | | | | |-- indices: array (nullable = true)\n", | |
" | | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | | |-- text: string (nullable = true)\n", | |
" | | | |-- media: array (nullable = true)\n", | |
" | | | | |-- element: struct (containsNull = true)\n", | |
" | | | | | |-- display_url: string (nullable = true)\n", | |
" | | | | | |-- expanded_url: string (nullable = true)\n", | |
" | | | | | |-- id: long (nullable = true)\n", | |
" | | | | | |-- id_str: string (nullable = true)\n", | |
" | | | | | |-- indices: array (nullable = true)\n", | |
" | | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | | |-- media_url: string (nullable = true)\n", | |
" | | | | | |-- media_url_https: string (nullable = true)\n", | |
" | | | | | |-- sizes: struct (nullable = true)\n", | |
" | | | | | | |-- large: struct (nullable = true)\n", | |
" | | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | | |-- medium: struct (nullable = true)\n", | |
" | | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | | |-- small: struct (nullable = true)\n", | |
" | | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | | |-- thumb: struct (nullable = true)\n", | |
" | | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | |-- type: string (nullable = true)\n", | |
" | | | | | |-- url: string (nullable = true)\n", | |
" | | | |-- symbols: array (nullable = true)\n", | |
" | | | | |-- element: string (containsNull = true)\n", | |
" | | | |-- urls: array (nullable = true)\n", | |
" | | | | |-- element: struct (containsNull = true)\n", | |
" | | | | | |-- display_url: string (nullable = true)\n", | |
" | | | | | |-- expanded_url: string (nullable = true)\n", | |
" | | | | | |-- indices: array (nullable = true)\n", | |
" | | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | | |-- url: string (nullable = true)\n", | |
" | | | |-- user_mentions: array (nullable = true)\n", | |
" | | | | |-- element: struct (containsNull = true)\n", | |
" | | | | | |-- id: long (nullable = true)\n", | |
" | | | | | |-- id_str: string (nullable = true)\n", | |
" | | | | | |-- indices: array (nullable = true)\n", | |
" | | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | | |-- name: string (nullable = true)\n", | |
" | | | | | |-- screen_name: string (nullable = true)\n", | |
" | | |-- extended_entities: struct (nullable = true)\n", | |
" | | | |-- media: array (nullable = true)\n", | |
" | | | | |-- element: struct (containsNull = true)\n", | |
" | | | | | |-- display_url: string (nullable = true)\n", | |
" | | | | | |-- expanded_url: string (nullable = true)\n", | |
" | | | | | |-- id: long (nullable = true)\n", | |
" | | | | | |-- id_str: string (nullable = true)\n", | |
" | | | | | |-- indices: array (nullable = true)\n", | |
" | | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | | |-- media_url: string (nullable = true)\n", | |
" | | | | | |-- media_url_https: string (nullable = true)\n", | |
" | | | | | |-- sizes: struct (nullable = true)\n", | |
" | | | | | | |-- large: struct (nullable = true)\n", | |
" | | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | | |-- medium: struct (nullable = true)\n", | |
" | | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | | |-- small: struct (nullable = true)\n", | |
" | | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | | |-- thumb: struct (nullable = true)\n", | |
" | | | | | | | |-- h: long (nullable = true)\n", | |
" | | | | | | | |-- resize: string (nullable = true)\n", | |
" | | | | | | | |-- w: long (nullable = true)\n", | |
" | | | | | |-- type: string (nullable = true)\n", | |
" | | | | | |-- url: string (nullable = true)\n", | |
" | | | | | |-- video_info: struct (nullable = true)\n", | |
" | | | | | | |-- aspect_ratio: array (nullable = true)\n", | |
" | | | | | | | |-- element: long (containsNull = true)\n", | |
" | | | | | | |-- variants: array (nullable = true)\n", | |
" | | | | | | | |-- element: struct (containsNull = true)\n", | |
" | | | | | | | | |-- bitrate: long (nullable = true)\n", | |
" | | | | | | | | |-- content_type: string (nullable = true)\n", | |
" | | | | | | | | |-- url: string (nullable = true)\n", | |
" | | |-- favorite_count: long (nullable = true)\n", | |
" | | |-- favorited: boolean (nullable = true)\n", | |
" | | |-- filter_level: string (nullable = true)\n", | |
" | | |-- geo: string (nullable = true)\n", | |
" | | |-- id: long (nullable = true)\n", | |
" | | |-- id_str: string (nullable = true)\n", | |
" | | |-- in_reply_to_screen_name: string (nullable = true)\n", | |
" | | |-- in_reply_to_status_id: long (nullable = true)\n", | |
" | | |-- in_reply_to_status_id_str: string (nullable = true)\n", | |
" | | |-- in_reply_to_user_id: long (nullable = true)\n", | |
" | | |-- in_reply_to_user_id_str: string (nullable = true)\n", | |
" | | |-- is_quote_status: boolean (nullable = true)\n", | |
" | | |-- lang: string (nullable = true)\n", | |
" | | |-- place: string (nullable = true)\n", | |
" | | |-- possibly_sensitive: boolean (nullable = true)\n", | |
" | | |-- quoted_status_id: long (nullable = true)\n", | |
" | | |-- quoted_status_id_str: string (nullable = true)\n", | |
" | | |-- retweet_count: long (nullable = true)\n", | |
" | | |-- retweeted: boolean (nullable = true)\n", | |
" | | |-- source: string (nullable = true)\n", | |
" | | |-- text: string (nullable = true)\n", | |
" | | |-- truncated: boolean (nullable = true)\n", | |
" | | |-- user: struct (nullable = true)\n", | |
" | | | |-- contributors_enabled: boolean (nullable = true)\n", | |
" | | | |-- created_at: string (nullable = true)\n", | |
" | | | |-- default_profile: boolean (nullable = true)\n", | |
" | | | |-- default_profile_image: boolean (nullable = true)\n", | |
" | | | |-- description: string (nullable = true)\n", | |
" | | | |-- favourites_count: long (nullable = true)\n", | |
" | | | |-- follow_request_sent: string (nullable = true)\n", | |
" | | | |-- followers_count: long (nullable = true)\n", | |
" | | | |-- following: string (nullable = true)\n", | |
" | | | |-- friends_count: long (nullable = true)\n", | |
" | | | |-- geo_enabled: boolean (nullable = true)\n", | |
" | | | |-- id: long (nullable = true)\n", | |
" | | | |-- id_str: string (nullable = true)\n", | |
" | | | |-- is_translator: boolean (nullable = true)\n", | |
" | | | |-- lang: string (nullable = true)\n", | |
" | | | |-- listed_count: long (nullable = true)\n", | |
" | | | |-- location: string (nullable = true)\n", | |
" | | | |-- name: string (nullable = true)\n", | |
" | | | |-- notifications: string (nullable = true)\n", | |
" | | | |-- profile_background_color: string (nullable = true)\n", | |
" | | | |-- profile_background_image_url: string (nullable = true)\n", | |
" | | | |-- profile_background_image_url_https: string (nullable = true)\n", | |
" | | | |-- profile_background_tile: boolean (nullable = true)\n", | |
" | | | |-- profile_banner_url: string (nullable = true)\n", | |
" | | | |-- profile_image_url: string (nullable = true)\n", | |
" | | | |-- profile_image_url_https: string (nullable = true)\n", | |
" | | | |-- profile_link_color: string (nullable = true)\n", | |
" | | | |-- profile_sidebar_border_color: string (nullable = true)\n", | |
" | | | |-- profile_sidebar_fill_color: string (nullable = true)\n", | |
" | | | |-- profile_text_color: string (nullable = true)\n", | |
" | | | |-- profile_use_background_image: boolean (nullable = true)\n", | |
" | | | |-- protected: boolean (nullable = true)\n", | |
" | | | |-- screen_name: string (nullable = true)\n", | |
" | | | |-- statuses_count: long (nullable = true)\n", | |
" | | | |-- time_zone: string (nullable = true)\n", | |
" | | | |-- url: string (nullable = true)\n", | |
" | | | |-- utc_offset: long (nullable = true)\n", | |
" | | | |-- verified: boolean (nullable = true)\n", | |
" | |-- quoted_status_id: long (nullable = true)\n", | |
" | |-- quoted_status_id_str: string (nullable = true)\n", | |
" | |-- retweet_count: long (nullable = true)\n", | |
" | |-- retweeted: boolean (nullable = true)\n", | |
" | |-- scopes: struct (nullable = true)\n", | |
" | | |-- followers: boolean (nullable = true)\n", | |
" | |-- source: string (nullable = true)\n", | |
" | |-- text: string (nullable = true)\n", | |
" | |-- truncated: boolean (nullable = true)\n", | |
" | |-- user: struct (nullable = true)\n", | |
" | | |-- contributors_enabled: boolean (nullable = true)\n", | |
" | | |-- created_at: string (nullable = true)\n", | |
" | | |-- default_profile: boolean (nullable = true)\n", | |
" | | |-- default_profile_image: boolean (nullable = true)\n", | |
" | | |-- description: string (nullable = true)\n", | |
" | | |-- favourites_count: long (nullable = true)\n", | |
" | | |-- follow_request_sent: string (nullable = true)\n", | |
" | | |-- followers_count: long (nullable = true)\n", | |
" | | |-- following: string (nullable = true)\n", | |
" | | |-- friends_count: long (nullable = true)\n", | |
" | | |-- geo_enabled: boolean (nullable = true)\n", | |
" | | |-- id: long (nullable = true)\n", | |
" | | |-- id_str: string (nullable = true)\n", | |
" | | |-- is_translator: boolean (nullable = true)\n", | |
" | | |-- lang: string (nullable = true)\n", | |
" | | |-- listed_count: long (nullable = true)\n", | |
" | | |-- location: string (nullable = true)\n", | |
" | | |-- name: string (nullable = true)\n", | |
" | | |-- notifications: string (nullable = true)\n", | |
" | | |-- profile_background_color: string (nullable = true)\n", | |
" | | |-- profile_background_image_url: string (nullable = true)\n", | |
" | | |-- profile_background_image_url_https: string (nullable = true)\n", | |
" | | |-- profile_background_tile: boolean (nullable = true)\n", | |
" | | |-- profile_banner_url: string (nullable = true)\n", | |
" | | |-- profile_image_url: string (nullable = true)\n", | |
" | | |-- profile_image_url_https: string (nullable = true)\n", | |
" | | |-- profile_link_color: string (nullable = true)\n", | |
" | | |-- profile_sidebar_border_color: string (nullable = true)\n", | |
" | | |-- profile_sidebar_fill_color: string (nullable = true)\n", | |
" | | |-- profile_text_color: string (nullable = true)\n", | |
" | | |-- profile_use_background_image: boolean (nullable = true)\n", | |
" | | |-- protected: boolean (nullable = true)\n", | |
" | | |-- screen_name: string (nullable = true)\n", | |
" | | |-- statuses_count: long (nullable = true)\n", | |
" | | |-- time_zone: string (nullable = true)\n", | |
" | | |-- url: string (nullable = true)\n", | |
" | | |-- utc_offset: long (nullable = true)\n", | |
" | | |-- verified: boolean (nullable = true)\n", | |
" |-- source: string (nullable = true)\n", | |
" |-- text: string (nullable = true)\n", | |
" |-- timestamp_ms: string (nullable = true)\n", | |
" |-- truncated: boolean (nullable = true)\n", | |
" |-- user: struct (nullable = true)\n", | |
" | |-- contributors_enabled: boolean (nullable = true)\n", | |
" | |-- created_at: string (nullable = true)\n", | |
" | |-- default_profile: boolean (nullable = true)\n", | |
" | |-- default_profile_image: boolean (nullable = true)\n", | |
" | |-- description: string (nullable = true)\n", | |
" | |-- favourites_count: long (nullable = true)\n", | |
" | |-- follow_request_sent: string (nullable = true)\n", | |
" | |-- followers_count: long (nullable = true)\n", | |
" | |-- following: string (nullable = true)\n", | |
" | |-- friends_count: long (nullable = true)\n", | |
" | |-- geo_enabled: boolean (nullable = true)\n", | |
" | |-- id: long (nullable = true)\n", | |
" | |-- id_str: string (nullable = true)\n", | |
" | |-- is_translator: boolean (nullable = true)\n", | |
" | |-- lang: string (nullable = true)\n", | |
" | |-- listed_count: long (nullable = true)\n", | |
" | |-- location: string (nullable = true)\n", | |
" | |-- name: string (nullable = true)\n", | |
" | |-- notifications: string (nullable = true)\n", | |
" | |-- profile_background_color: string (nullable = true)\n", | |
" | |-- profile_background_image_url: string (nullable = true)\n", | |
" | |-- profile_background_image_url_https: string (nullable = true)\n", | |
" | |-- profile_background_tile: boolean (nullable = true)\n", | |
" | |-- profile_banner_url: string (nullable = true)\n", | |
" | |-- profile_image_url: string (nullable = true)\n", | |
" | |-- profile_image_url_https: string (nullable = true)\n", | |
" | |-- profile_link_color: string (nullable = true)\n", | |
" | |-- profile_sidebar_border_color: string (nullable = true)\n", | |
" | |-- profile_sidebar_fill_color: string (nullable = true)\n", | |
" | |-- profile_text_color: string (nullable = true)\n", | |
" | |-- profile_use_background_image: boolean (nullable = true)\n", | |
" | |-- protected: boolean (nullable = true)\n", | |
" | |-- screen_name: string (nullable = true)\n", | |
" | |-- statuses_count: long (nullable = true)\n", | |
" | |-- time_zone: string (nullable = true)\n", | |
" | |-- url: string (nullable = true)\n", | |
" | |-- utc_offset: long (nullable = true)\n", | |
" | |-- verified: boolean (nullable = true)\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"df.printSchema()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+--------------------+--------+------------+-----------+--------------------+--------------------+-----------------+--------------+---------+------------+----+------------------+------------------+-----------------------+---------------------+-------------------------+-------------------+-----------------------+---------------+----+-----+------------------+-------------+----------------+--------------------+-------------+---------+----------------+--------------------+--------------------+-------------+---------+--------------------+\n", | |
"| @timestamp|@version|contributors|coordinates| created_at| entities|extended_entities|favorite_count|favorited|filter_level| geo| id| id_str|in_reply_to_screen_name|in_reply_to_status_id|in_reply_to_status_id_str|in_reply_to_user_id|in_reply_to_user_id_str|is_quote_status|lang|place|possibly_sensitive|quoted_status|quoted_status_id|quoted_status_id_str|retweet_count|retweeted|retweeted_status| source| text| timestamp_ms|truncated| user|\n", | |
"+--------------------+--------+------------+-----------+--------------------+--------------------+-----------------+--------------+---------+------------+----+------------------+------------------+-----------------------+---------------------+-------------------------+-------------------+-----------------------+---------------+----+-----+------------------+-------------+----------------+--------------------+-------------+---------+----------------+--------------------+--------------------+-------------+---------+--------------------+\n", | |
"|2016-07-12T16:13:...| 1| null| null|Tue Jul 12 16:13:...|[WrappedArray([Wr...| null| 0| false| low|null|752898589408399360|752898589408399360| null| null| null| null| null| false| en| null| null| null| null| null| 0| false| null|<a href=\"http://t...|\"Students need to...|1468339986551| false|[false,Thu May 21...|\n", | |
"+--------------------+--------+------------+-----------+--------------------+--------------------+-----------------+--------------+---------+------------+----+------------------+------------------+-----------------------+---------------------+-------------------------+-------------------+-----------------------+---------------+----+-----+------------------+-------------+----------------+--------------------+-------------+---------+----------------+--------------------+--------------------+-------------+---------+--------------------+\n", | |
"only showing top 1 row\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"df.show(1)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Write the Spark dataframe as a Hive table" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"tablename = 'twitter_full'\n", | |
"qualified_tablename='default.' + tablename\n", | |
"df.write.mode('Overwrite').saveAsTable(qualified_tablename)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"_N.B. this table created in Hive is invalid:_\n", | |
" \n", | |
" `Error while processing statement: FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. Error: > expected at the position 8430 of 'string:string:string:struct<coordinates:array<double>,type:s [...] n>' but ':' is found.`" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Create a new dataframe using a very simple SQL query against the loaded JSON" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"subset01=sqlContext.sql(\"SELECT text FROM twitter\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Show subset contents" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+--------------------+\n", | |
"| text|\n", | |
"+--------------------+\n", | |
"|\"Students need to...|\n", | |
"|Ga caya :( https:...|\n", | |
"|Want to work at O...|\n", | |
"|RT @KaumABs: #AB ...|\n", | |
"|RT @tika980615: t...|\n", | |
"|RT @UrkMcGurk: Re...|\n", | |
"|RT @MongoDB: Lear...|\n", | |
"|RT @PhilippeLavau...|\n", | |
"|PASTI LAH, GUA KA...|\n", | |
"|RT @GC_Tabii: Bua...|\n", | |
"+--------------------+\n", | |
"only showing top 10 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"subset01.show(10)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Use dot notation to access nested objects" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 75, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"subset02 = sqlContext.sql(\"SELECT created_at, user.screen_name, text FROM twitter\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Show contents" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 76, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+--------------------+--------------+--------------------+\n", | |
"| created_at| screen_name| text|\n", | |
"+--------------------+--------------+--------------------+\n", | |
"|Tue Jul 12 16:13:...| Snehalstocks|\"Students need to...|\n", | |
"|Tue Jul 12 16:13:...| KingMarkT93|Ga caya :( https:...|\n", | |
"|Tue Jul 12 16:13:...| tmj_usa_sales|Want to work at O...|\n", | |
"|Tue Jul 12 16:13:...| otyoonsu|RT @KaumABs: #AB ...|\n", | |
"|Tue Jul 12 16:13:...| adline_adlina|RT @tika980615: t...|\n", | |
"|Tue Jul 12 16:13:...| AFlyLady|RT @UrkMcGurk: Re...|\n", | |
"|Tue Jul 12 16:13:...| __luizfilipe|RT @MongoDB: Lear...|\n", | |
"|Tue Jul 12 16:13:...| Oracle_France|RT @PhilippeLavau...|\n", | |
"|Tue Jul 12 16:13:...|parkjiyeonnn93|PASTI LAH, GUA KA...|\n", | |
"|Tue Jul 12 16:13:...|parkjiyeonnn93|RT @GC_Tabii: Bua...|\n", | |
"|Tue Jul 12 16:13:...| iis_ryanii|RT @AyukkAy: Kudu...|\n", | |
"|Tue Jul 12 16:13:...| JokoMonte|Vi este gif y pen...|\n", | |
"|Tue Jul 12 16:13:...| FranckPachot|RT @JLOracle: If ...|\n", | |
"|Tue Jul 12 16:13:...| jitu1114|#Hiring Req: Orac...|\n", | |
"|Tue Jul 12 16:13:...| skawwngur|@jjcceco_ kudu pi...|\n", | |
"|Tue Jul 12 16:13:...| arrgh|RT @ibdknox: Hado...|\n", | |
"|Tue Jul 12 16:13:...| wnda_|pap ketiak sendir...|\n", | |
"|Tue Jul 12 16:13:...| jmdigne|Using the Oracle ...|\n", | |
"|Tue Jul 12 16:14:...| YenniiJenny|RT @CangkemuAmbro...|\n", | |
"|Tue Jul 12 16:14:...| theflynews|NetSuite is conti...|\n", | |
"+--------------------+--------------+--------------------+\n", | |
"only showing top 20 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"subset02.show()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Write the Spark dataframe as a Hive table" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 77, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"tablename = 'twitter_user_text'\n", | |
"qualified_tablename='default.' + tablename\n", | |
"subset02.write.mode('Overwrite').saveAsTable(qualified_tablename)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## 'Explode' array objects, such as hashtags and demonstrate filtering" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 97, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"subset03 = sqlContext.sql(\"SELECT id, created_at, user.screen_name,text as tweet_text,hashtag.text as hashtag,user_mentions.screen_name as mentioned_user from twitter LATERAL VIEW OUTER explode(entities.user_mentions) user_mentionsTable as user_mentions LATERAL VIEW OUTER explode(entities.hashtags) hashtagsTable AS hashtag WHERE id = '752940179569115100'\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Show contents" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 98, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+------------------+--------------------+-----------+--------------------+-------+--------------+\n", | |
"| id| created_at|screen_name| tweet_text|hashtag|mentioned_user|\n", | |
"+------------------+--------------------+-----------+--------------------+-------+--------------+\n", | |
"|752940179569115136|Tue Jul 12 18:58:...| flederbine|@johnnyq72 @orcld...|ImALLin| johnnyq72|\n", | |
"|752940179569115136|Tue Jul 12 18:58:...| flederbine|@johnnyq72 @orcld...|ImALLin| orcldoug|\n", | |
"|752940179569115136|Tue Jul 12 18:58:...| flederbine|@johnnyq72 @orcld...|ImALLin| rmoff|\n", | |
"|752940179569115136|Tue Jul 12 18:58:...| flederbine|@johnnyq72 @orcld...|ImALLin| markrittman|\n", | |
"|752940179569115136|Tue Jul 12 18:58:...| flederbine|@johnnyq72 @orcld...|ImALLin| mikedurran|\n", | |
"+------------------+--------------------+-----------+--------------------+-------+--------------+\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"subset03.show()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Write the Spark dataframe as a Hive table" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 99, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"tablename = 'twitter_user_text_hashtags'\n", | |
"qualified_tablename='default.' + tablename\n", | |
"subset03.write.mode('Overwrite').saveAsTable(qualified_tablename)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Run an aggregation, showing which hashtags are the most common in the dataset" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 107, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"+-----------+----------+\n", | |
"| text|inst_count|\n", | |
"+-----------+----------+\n", | |
"| Hadoop| 165|\n", | |
"| Oracle| 151|\n", | |
"| job| 128|\n", | |
"| BigData| 112|\n", | |
"| CareerArc| 109|\n", | |
"|Hearthstone| 102|\n", | |
"| GetMorgl| 86|\n", | |
"| Hiring| 86|\n", | |
"| Sales| 80|\n", | |
"| hiring| 79|\n", | |
"| hadoop| 77|\n", | |
"| jobs| 70|\n", | |
"| oracle| 59|\n", | |
"| Job| 55|\n", | |
"| Jobs| 51|\n", | |
"| bigdata| 51|\n", | |
"| Spark| 51|\n", | |
"| Cloud| 33|\n", | |
"| Java| 31|\n", | |
"| IT| 30|\n", | |
"+-----------+----------+\n", | |
"only showing top 20 rows\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"sqlContext.sql(\"SELECT hashtag.text,count(*) as inst_count from twitter LATERAL VIEW OUTER explode(entities.hashtags) hashtagsTable AS hashtag GROUP BY hashtag.text order by inst_count desc\").show()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment