Skip to content

Instantly share code, notes, and snippets.

@danking
Last active February 2, 2024 15:15
Show Gist options
  • Save danking/4845af38928e90ff943c006f3682e9af to your computer and use it in GitHub Desktop.
Save danking/4845af38928e90ff943c006f3682e9af to your computer and use it in GitHub Desktop.
Table Joins Tutorial (0 + 1) / 1]
--------------------
This tutorial walks through some ways to join Hail tables. We'll use a simple movie dataset to illustrate. The movie dataset comes in multiple parts. Here are a few questions we might naturally ask about the dataset:
- What is the mean rating per genre?
- What is the favorite movie for each occupation?
- What genres are most preferred by women vs men?
We'll use joins to combine datasets in order to answer these questions.
Let's initialize Hail, fetch the tutorial data, and load three tables: users, movies, and ratings.
..
.. nbinput:: ipython3
:execution-count: 1
import hail as hl
hl.utils.get_movie_lens('data/')
users = hl.read_table('data/users.ht')
movies = hl.read_table('data/movies.ht')
ratings = hl.read_table('data/ratings.ht')
..
.. nboutput::
:more-to-come:
:fancy:
:class: rendered_html
.. raw:: html
<style>
.bk-notebook-logo {
display: block;
width: 20px;
height: 20px;
background-image: url();
}
</style>
<div>
<a href="https://bokeh.org" target="_blank" class="bk-notebook-logo"></a>
<span id="d82158bd-b9c5-4fd3-b173-5f977b2ec596">Loading BokehJS ...</span>
</div>
..
.. only:: html
.. nboutput::
:more-to-come:
:fancy:
.. raw:: html
<div class="output_javascript"></div>
<script type="text/javascript">
var element = document.currentScript.previousSibling.previousSibling;
(function(root) {
function now() {
return new Date();
}
const force = true;
if (typeof root._bokeh_onload_callbacks === "undefined" || force === true) {
root._bokeh_onload_callbacks = [];
root._bokeh_is_loading = undefined;
}
const JS_MIME_TYPE = 'application/javascript';
const HTML_MIME_TYPE = 'text/html';
const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';
const CLASS_NAME = 'output_bokeh rendered_html';
/**
* Render data to the DOM node
*/
function render(props, node) {
const script = document.createElement("script");
node.appendChild(script);
}
/**
* Handle when an output is cleared or removed
*/
function handleClearOutput(event, handle) {
function drop(id) {
const view = Bokeh.index.get_by_id(id)
if (view != null) {
view.model.document.clear()
Bokeh.index.delete(view)
}
}
const cell = handle.cell;
const id = cell.output_area._bokeh_element_id;
const server_id = cell.output_area._bokeh_server_id;
// Clean up Bokeh references
if (id != null) {
drop(id)
}
if (server_id !== undefined) {
// Clean up Bokeh references
const cmd_clean = "from bokeh.io.state import curstate; print(curstate().uuid_to_server['" + server_id + "'].get_sessions()[0].document.roots[0]._id)";
cell.notebook.kernel.execute(cmd_clean, {
iopub: {
output: function(msg) {
const id = msg.content.text.trim()
drop(id)
}
}
});
// Destroy server and session
const cmd_destroy = "import bokeh.io.notebook as ion; ion.destroy_server('" + server_id + "')";
cell.notebook.kernel.execute(cmd_destroy);
}
}
/**
* Handle when a new output is added
*/
function handleAddOutput(event, handle) {
const output_area = handle.output_area;
const output = handle.output;
// limit handleAddOutput to display_data with EXEC_MIME_TYPE content only
if ((output.output_type != "display_data") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {
return
}
const toinsert = output_area.element.find("." + CLASS_NAME.split(' ')[0]);
if (output.metadata[EXEC_MIME_TYPE]["id"] !== undefined) {
toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];
// store reference to embed id on output_area
output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE]["id"];
}
if (output.metadata[EXEC_MIME_TYPE]["server_id"] !== undefined) {
const bk_div = document.createElement("div");
bk_div.innerHTML = output.data[HTML_MIME_TYPE];
const script_attrs = bk_div.children[0].attributes;
for (let i = 0; i < script_attrs.length; i++) {
toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);
toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent
}
// store reference to server id on output_area
output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE]["server_id"];
}
}
function register_renderer(events, OutputArea) {
function append_mime(data, metadata, element) {
// create a DOM node to render to
const toinsert = this.create_output_subarea(
metadata,
CLASS_NAME,
EXEC_MIME_TYPE
);
this.keyboard_manager.register_events(toinsert);
// Render to node
const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};
render(props, toinsert[toinsert.length - 1]);
element.append(toinsert);
return toinsert
}
/* Handle when an output is cleared or removed */
events.on('clear_output.CodeCell', handleClearOutput);
events.on('delete.Cell', handleClearOutput);
/* Handle when a new output is added */
events.on('output_added.OutputArea', handleAddOutput);
/**
* Register the mime type and append_mime function with output_area
*/
OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {
/* Is output safe? */
safe: true,
/* Index of renderer in `output_area.display_order` */
index: 0
});
}
// register the mime type if in Jupyter Notebook environment and previously unregistered
if (root.Jupyter !== undefined) {
const events = require('base/js/events');
const OutputArea = require('notebook/js/outputarea').OutputArea;
if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {
register_renderer(events, OutputArea);
}
}
if (typeof (root._bokeh_timeout) === "undefined" || force === true) {
root._bokeh_timeout = Date.now() + 5000;
root._bokeh_failed_load = false;
}
const NB_LOAD_WARNING = {'data': {'text/html':
"<div style='background-color: #fdd'>\n"+
"<p>\n"+
"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \n"+
"may be due to a slow or bad network connection. Possible fixes:\n"+
"</p>\n"+
"<ul>\n"+
"<li>re-rerun `output_notebook()` to attempt to load from CDN again, or</li>\n"+
"<li>use INLINE resources instead, as so:</li>\n"+
"</ul>\n"+
"<code>\n"+
"from bokeh.resources import INLINE\n"+
"output_notebook(resources=INLINE)\n"+
"</code>\n"+
"</div>"}};
function display_loaded() {
const el = document.getElementById("d82158bd-b9c5-4fd3-b173-5f977b2ec596");
if (el != null) {
el.textContent = "BokehJS is loading...";
}
if (root.Bokeh !== undefined) {
if (el != null) {
el.textContent = "BokehJS " + root.Bokeh.version + " successfully loaded.";
}
} else if (Date.now() < root._bokeh_timeout) {
setTimeout(display_loaded, 100)
}
}
function run_callbacks() {
try {
root._bokeh_onload_callbacks.forEach(function(callback) {
if (callback != null)
callback();
});
} finally {
delete root._bokeh_onload_callbacks
}
console.debug("Bokeh: all callbacks have finished");
}
function load_libs(css_urls, js_urls, callback) {
if (css_urls == null) css_urls = [];
if (js_urls == null) js_urls = [];
root._bokeh_onload_callbacks.push(callback);
if (root._bokeh_is_loading > 0) {
console.debug("Bokeh: BokehJS is being loaded, scheduling callback at", now());
return null;
}
if (js_urls == null || js_urls.length === 0) {
run_callbacks();
return null;
}
console.debug("Bokeh: BokehJS not loaded, scheduling load and callback at", now());
root._bokeh_is_loading = css_urls.length + js_urls.length;
function on_load() {
root._bokeh_is_loading--;
if (root._bokeh_is_loading === 0) {
console.debug("Bokeh: all BokehJS libraries/stylesheets loaded");
run_callbacks()
}
}
function on_error(url) {
console.error("failed to load " + url);
}
for (let i = 0; i < css_urls.length; i++) {
const url = css_urls[i];
const element = document.createElement("link");
element.onload = on_load;
element.onerror = on_error.bind(null, url);
element.rel = "stylesheet";
element.type = "text/css";
element.href = url;
console.debug("Bokeh: injecting link tag for BokehJS stylesheet: ", url);
document.body.appendChild(element);
}
for (let i = 0; i < js_urls.length; i++) {
const url = js_urls[i];
const element = document.createElement('script');
element.onload = on_load;
element.onerror = on_error.bind(null, url);
element.async = false;
element.src = url;
console.debug("Bokeh: injecting script tag for BokehJS library: ", url);
document.head.appendChild(element);
}
};
function inject_raw_css(css) {
const element = document.createElement("style");
element.appendChild(document.createTextNode(css));
document.body.appendChild(element);
}
const js_urls = ["https://cdn.bokeh.org/bokeh/release/bokeh-3.3.4.min.js", "https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.4.min.js", "https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.4.min.js", "https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.4.min.js", "https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.4.min.js"];
const css_urls = [];
const inline_js = [ function(Bokeh) {
Bokeh.set_log_level("info");
},
function(Bokeh) {
}
];
function run_inline_js() {
if (root.Bokeh !== undefined || force === true) {
for (let i = 0; i < inline_js.length; i++) {
inline_js[i].call(root, root.Bokeh);
}
if (force === true) {
display_loaded();
}} else if (Date.now() < root._bokeh_timeout) {
setTimeout(run_inline_js, 100);
} else if (!root._bokeh_failed_load) {
console.log("Bokeh: BokehJS failed to load within specified timeout.");
root._bokeh_failed_load = true;
} else if (force !== true) {
const cell = $(document.getElementById("d82158bd-b9c5-4fd3-b173-5f977b2ec596")).parents('.cell').data().cell;
cell.output_area.append_execute_result(NB_LOAD_WARNING)
}
}
if (root._bokeh_is_loading === 0) {
console.debug("Bokeh: BokehJS loaded, going straight to plotting");
run_inline_js();
} else {
load_libs(css_urls, js_urls, function() {
console.debug("Bokeh: BokehJS plotting callback run at", now());
run_inline_js();
});
}
}(window));
</script>
.. only:: latex
.. nboutput::
:more-to-come:
:fancy:
.. nbwarning:: Data type cannot be displayed: application/javascript, application/vnd.bokehjs_load.v0+json
..
.. nboutput::
:more-to-come:
:class: stderr
.. rst-class:: highlight
.. raw:: html
<pre>
Initializing Hail with default parameters...
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
Initializing Hail with default parameters{\ldots}
\end{sphinxVerbatim}
.. raw:: text
Initializing Hail with default parameters...
..
.. nboutput::
:more-to-come:
:class: stderr
.. rst-class:: highlight
.. raw:: html
<pre>
/Users/dking/projects/hail/hail/python/hailtop/aiocloud/aiogoogle/user_config.py:29: UserWarning:
You have specified the GCS requester pays configuration in both your spark-defaults.conf (/Users/dking/miniconda3/lib/python3.10/site-packages/pyspark/conf/spark-defaults.conf) and either an explicit argument or through `hailctl config`. For GCS requester pays configuration, Hail first checks explicit arguments, then `hailctl config`, then spark-defaults.conf.
/Users/dking/projects/hail/hail/python/hail/backend/backend.py:63: UserWarning:
!!! THIS IS A DEVELOPMENT VERSION OF HAIL !!!
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
/Users/dking/projects/hail/hail/python/hailtop/aiocloud/aiogoogle/user\_config.py:29: UserWarning:
You have specified the GCS requester pays configuration in both your spark-defaults.conf (/Users/dking/miniconda3/lib/python3.10/site-packages/pyspark/conf/spark-defaults.conf) and either an explicit argument or through `hailctl config`. For GCS requester pays configuration, Hail first checks explicit arguments, then `hailctl config`, then spark-defaults.conf.
/Users/dking/projects/hail/hail/python/hail/backend/backend.py:63: UserWarning:
!!! THIS IS A DEVELOPMENT VERSION OF HAIL !!!
\end{sphinxVerbatim}
.. raw:: text
/Users/dking/projects/hail/hail/python/hailtop/aiocloud/aiogoogle/user_config.py:29: UserWarning:
You have specified the GCS requester pays configuration in both your spark-defaults.conf (/Users/dking/miniconda3/lib/python3.10/site-packages/pyspark/conf/spark-defaults.conf) and either an explicit argument or through `hailctl config`. For GCS requester pays configuration, Hail first checks explicit arguments, then `hailctl config`, then spark-defaults.conf.
/Users/dking/projects/hail/hail/python/hail/backend/backend.py:63: UserWarning:
!!! THIS IS A DEVELOPMENT VERSION OF HAIL !!!
..
.. nboutput::
:more-to-come:
.. rst-class:: highlight
.. raw:: html
<pre>
24/02/02 10:12:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
24/02/02 10:12:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform{\ldots} using builtin-java classes where applicable
\end{sphinxVerbatim}
.. raw:: text
24/02/02 10:12:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
..
.. nboutput::
:more-to-come:
:class: stderr
.. rst-class:: highlight
.. raw:: html
<pre>
Setting default log level to &#34;WARN&#34;.
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
\end{sphinxVerbatim}
.. raw:: text
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
..
.. nboutput::
:class: stderr
.. rst-class:: highlight
.. raw:: html
<pre>
Running on Apache Spark version 3.3.2
SparkUI available at http://192.168.1.140:4040
Welcome to
__ __ &lt;&gt;__
/ /_/ /__ __/ /
/ __ / _ `/ / /
/_/ /_/\_,_/_/_/ version 0.2.127-bb04f05412f0
LOGGING: writing to /Users/dking/projects/hail/hail-20240202-1012-0.2.127-bb04f05412f0.log
2024-02-02 10:12:45.158 Hail: INFO: Movie Lens files found!
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
Running on Apache Spark version 3.3.2
SparkUI available at http://192.168.1.140:4040
Welcome to
\_\_ \_\_ <>\_\_
/ /\_/ /\_\_ \_\_/ /
/ \_\_ / \_ `/ / /
/\_/ /\_/\textbackslash{}\_,\_/\_/\_/ version 0.2.127-bb04f05412f0
LOGGING: writing to /Users/dking/projects/hail/hail-20240202-1012-0.2.127-bb04f05412f0.log
2024-02-02 10:12:45.158 Hail: INFO: Movie Lens files found!
\end{sphinxVerbatim}
.. raw:: text
Running on Apache Spark version 3.3.2
SparkUI available at http://192.168.1.140:4040
Welcome to
__ __ <>__
/ /_/ /__ __/ /
/ __ / _ `/ / /
/_/ /_/\_,_/_/_/ version 0.2.127-bb04f05412f0
LOGGING: writing to /Users/dking/projects/hail/hail-20240202-1012-0.2.127-bb04f05412f0.log
2024-02-02 10:12:45.158 Hail: INFO: Movie Lens files found!
..
The Key to Understanding Joins
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
To understand joins in Hail, we need to revisit one of the crucial properties of tables: the key.
A table has an ordered list of fields known as the key. Our ``users`` table has one key, the ``id`` field. We can see all the fields, as well as the keys, of a table by calling ``describe()``.
..
.. nbinput:: ipython3
:execution-count: 2
users.describe()
..
.. nboutput::
.. rst-class:: highlight
.. raw:: html
<pre>
----------------------------------------
Global fields:
None
----------------------------------------
Row fields:
&#39;id&#39;: int32
&#39;age&#39;: int32
&#39;sex&#39;: str
&#39;occupation&#39;: str
&#39;zipcode&#39;: str
----------------------------------------
Key: [&#39;id&#39;]
----------------------------------------
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
----------------------------------------
Global fields:
None
----------------------------------------
Row fields:
'id': int32
'age': int32
'sex': str
'occupation': str
'zipcode': str
----------------------------------------
Key: ['id']
----------------------------------------
\end{sphinxVerbatim}
.. raw:: text
----------------------------------------
Global fields:
None
----------------------------------------
Row fields:
'id': int32
'age': int32
'sex': str
'occupation': str
'zipcode': str
----------------------------------------
Key: ['id']
----------------------------------------
..
``key`` is a struct expression of all of the key fields, so we can refer to the key of a table without explicitly specifying the names of the key fields.
..
.. nbinput:: ipython3
:execution-count: 3
users.key.describe()
..
.. nboutput::
.. rst-class:: highlight
.. raw:: html
<pre>
--------------------------------------------------------
Type:
struct {
id: int32
}
--------------------------------------------------------
Source:
&lt;hail.table.Table object at 0x2863a3ca0&gt;
Index:
[&#39;row&#39;]
--------------------------------------------------------
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
--------------------------------------------------------
Type:
struct \{
id: int32
\}
--------------------------------------------------------
Source:
<hail.table.Table object at 0x2863a3ca0>
Index:
['row']
--------------------------------------------------------
\end{sphinxVerbatim}
.. raw:: text
--------------------------------------------------------
Type:
struct {
id: int32
}
--------------------------------------------------------
Source:
<hail.table.Table object at 0x2863a3ca0>
Index:
['row']
--------------------------------------------------------
..
Keys need not be unique or non-missing, although in many applications they will be both.
When tables are joined in Hail, they are joined based on their keys. In order to join two tables, they must share the same number of keys, same key types (i.e. string vs integer), and the same order of keys.
..
Let's look at a simple example of a join. We'll use the ``Table.parallelize()`` method to create two small tables, ``t1`` and ``t2``.
..
.. nbinput:: ipython3
:execution-count: 4
:no-output:
t1 = hl.Table.parallelize([
{'a': 'foo', 'b': 1},
{'a': 'bar', 'b': 2},
{'a': 'bar', 'b': 2}],
hl.tstruct(a=hl.tstr, b=hl.tint32),
key='a')
t2 = hl.Table.parallelize([
{'t': 'foo', 'x': 3.14},
{'t': 'bar', 'x': 2.78},
{'t': 'bar', 'x': -1},
{'t': 'quam', 'x': 0}],
hl.tstruct(t=hl.tstr, x=hl.tfloat64),
key='t')
..
.. nbinput:: ipython3
:execution-count: 5
t1.show()
..
.. nboutput::
:more-to-come:
:class: stderr
.. rst-class:: highlight
.. raw:: html
<pre>
[Stage 0:&gt; (0 + 1) / 1]
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
[Stage 0:> (0 + 1) / 1]
\end{sphinxVerbatim}
.. raw:: text
[Stage 0:> (0 + 1) / 1]
..
.. only:: html
.. nboutput::
:fancy:
:class: rendered_html
.. raw:: html
<table><thead><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">a</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">b</div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">str</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">int32</td></tr>
</thead><tbody><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;bar&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;bar&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;foo&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td></tr>
</tbody></table>
.. only:: latex
.. nboutput::
.. rst-class:: highlight
.. raw:: html
<pre>
+-------+-------+
| a | b |
+-------+-------+
| str | int32 |
+-------+-------+
| &#34;bar&#34; | 2 |
| &#34;bar&#34; | 2 |
| &#34;foo&#34; | 1 |
+-------+-------+
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
+-------+-------+
| a | b |
+-------+-------+
| str | int32 |
+-------+-------+
| "bar" | 2 |
| "bar" | 2 |
| "foo" | 1 |
+-------+-------+
\end{sphinxVerbatim}
.. raw:: text
+-------+-------+
| a | b |
+-------+-------+
| str | int32 |
+-------+-------+
| "bar" | 2 |
| "bar" | 2 |
| "foo" | 1 |
+-------+-------+
..
.. nbinput:: ipython3
:execution-count: 6
t2.show()
..
.. only:: html
.. nboutput::
:fancy:
:class: rendered_html
.. raw:: html
<table><thead><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">t</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">x</div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">str</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">float64</td></tr>
</thead><tbody><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;bar&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2.78e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;bar&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">-1.00e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;foo&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.14e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;quam&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">0.00e+00</td></tr>
</tbody></table>
.. only:: latex
.. nboutput::
.. rst-class:: highlight
.. raw:: html
<pre>
+--------+-----------+
| t | x |
+--------+-----------+
| str | float64 |
+--------+-----------+
| &#34;bar&#34; | 2.78e+00 |
| &#34;bar&#34; | -1.00e+00 |
| &#34;foo&#34; | 3.14e+00 |
| &#34;quam&#34; | 0.00e+00 |
+--------+-----------+
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
+--------+-----------+
| t | x |
+--------+-----------+
| str | float64 |
+--------+-----------+
| "bar" | 2.78e+00 |
| "bar" | -1.00e+00 |
| "foo" | 3.14e+00 |
| "quam" | 0.00e+00 |
+--------+-----------+
\end{sphinxVerbatim}
.. raw:: text
+--------+-----------+
| t | x |
+--------+-----------+
| str | float64 |
+--------+-----------+
| "bar" | 2.78e+00 |
| "bar" | -1.00e+00 |
| "foo" | 3.14e+00 |
| "quam" | 0.00e+00 |
+--------+-----------+
..
Now, we can join the tables.
..
.. nbinput:: ipython3
:execution-count: 7
j = t1.annotate(t2_x = t2[t1.a].x)
j.show()
..
.. nboutput::
:more-to-come:
:class: stderr
.. rst-class:: highlight
.. raw:: html
<pre>
[Stage 2:&gt; (0 + 10) / 10]
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
[Stage 2:> (0 + 10) / 10]
\end{sphinxVerbatim}
.. raw:: text
[Stage 2:> (0 + 10) / 10]
..
.. only:: html
.. nboutput::
:fancy:
:class: rendered_html
.. raw:: html
<table><thead><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">a</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">b</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">t2_x</div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">str</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">int32</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">float64</td></tr>
</thead><tbody><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;bar&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2.78e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;bar&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2.78e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;foo&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.14e+00</td></tr>
</tbody></table>
.. only:: latex
.. nboutput::
.. rst-class:: highlight
.. raw:: html
<pre>
+-------+-------+----------+
| a | b | t2_x |
+-------+-------+----------+
| str | int32 | float64 |
+-------+-------+----------+
| &#34;bar&#34; | 2 | 2.78e+00 |
| &#34;bar&#34; | 2 | 2.78e+00 |
| &#34;foo&#34; | 1 | 3.14e+00 |
+-------+-------+----------+
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
+-------+-------+----------+
| a | b | t2\_x |
+-------+-------+----------+
| str | int32 | float64 |
+-------+-------+----------+
| "bar" | 2 | 2.78e+00 |
| "bar" | 2 | 2.78e+00 |
| "foo" | 1 | 3.14e+00 |
+-------+-------+----------+
\end{sphinxVerbatim}
.. raw:: text
+-------+-------+----------+
| a | b | t2_x |
+-------+-------+----------+
| str | int32 | float64 |
+-------+-------+----------+
| "bar" | 2 | 2.78e+00 |
| "bar" | 2 | 2.78e+00 |
| "foo" | 1 | 3.14e+00 |
+-------+-------+----------+
..
Let's break this syntax down.
``t2[t1.a]`` is an expression referring to the row of table ``t2`` with value ``t1.a``. So this expression will create a map between the keys of ``t1`` and the rows of ``t2``. You can view this mapping directly:
..
.. nbinput:: ipython3
:execution-count: 8
t2[t1.a].show()
..
.. only:: html
.. nboutput::
:fancy:
:class: rendered_html
.. raw:: html
<table><thead><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">&lt;expr&gt;</div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">a</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">x</div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">str</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">float64</td></tr>
</thead><tbody><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;bar&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2.78e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;bar&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2.78e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;foo&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.14e+00</td></tr>
</tbody></table>
.. only:: latex
.. nboutput::
.. rst-class:: highlight
.. raw:: html
<pre>
+-------+----------+
| a | &lt;expr&gt;.x |
+-------+----------+
| str | float64 |
+-------+----------+
| &#34;bar&#34; | 2.78e+00 |
| &#34;bar&#34; | 2.78e+00 |
| &#34;foo&#34; | 3.14e+00 |
+-------+----------+
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
+-------+----------+
| a | <expr>.x |
+-------+----------+
| str | float64 |
+-------+----------+
| "bar" | 2.78e+00 |
| "bar" | 2.78e+00 |
| "foo" | 3.14e+00 |
+-------+----------+
\end{sphinxVerbatim}
.. raw:: text
+-------+----------+
| a | <expr>.x |
+-------+----------+
| str | float64 |
+-------+----------+
| "bar" | 2.78e+00 |
| "bar" | 2.78e+00 |
| "foo" | 3.14e+00 |
+-------+----------+
..
Since we only want the field ``x`` from ``t2``, we can select it with ``t2[t1.a].x``. Then we add this field to ``t1`` with the ``anntotate_rows()`` method. The new joined table ``j`` has a field ``t2_x`` that comes from the rows of ``t2``. The tables could be joined, because they shared the same number of keys (1) and the same key type (string). The keys do not need to share the same name. Notice that the rows with keys present in ``t2`` but not in ``t1`` do not show up in the final result.
This join syntax performs a left join. Tables also have a SQL-style inner/left/right/outer `join() <https://hail.is/docs/0.2/hail.Table.html#hail.Table.join>`__ method.
The magic of keys is that they can be used to create a mapping, like a Python dictionary, between the keys of one table and the row values of another table: ``table[expr]`` will refer to the row of ``table`` that has a key value of ``expr``. If the row is not unique, one such row is chosen arbitrarily.
Here's a subtle bit: if ``expr`` is an expression indexed by a row of ``table2``, then ``table[expr]`` is also an expression indexed by a row of ``table2``.
Also note that while they look similar, ``table['field']`` and ``table1[table2.key]`` are doing very different things!
``table['field']`` selects a field from the table, while ``table1[table2.key]`` creates a mapping between the keys of ``table2`` and the rows of ``table1``.
..
.. nbinput:: ipython3
:execution-count: 9
t1['a'].describe()
..
.. nboutput::
.. rst-class:: highlight
.. raw:: html
<pre>
--------------------------------------------------------
Type:
str
--------------------------------------------------------
Source:
&lt;hail.table.Table object at 0x28773a890&gt;
Index:
[&#39;row&#39;]
--------------------------------------------------------
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
--------------------------------------------------------
Type:
str
--------------------------------------------------------
Source:
<hail.table.Table object at 0x28773a890>
Index:
['row']
--------------------------------------------------------
\end{sphinxVerbatim}
.. raw:: text
--------------------------------------------------------
Type:
str
--------------------------------------------------------
Source:
<hail.table.Table object at 0x28773a890>
Index:
['row']
--------------------------------------------------------
..
.. nbinput:: ipython3
:execution-count: 10
t2[t1.a].describe()
..
.. nboutput::
.. rst-class:: highlight
.. raw:: html
<pre>
--------------------------------------------------------
Type:
struct {
x: float64
}
--------------------------------------------------------
Source:
&lt;hail.table.Table object at 0x28773a890&gt;
Index:
[&#39;row&#39;]
--------------------------------------------------------
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
--------------------------------------------------------
Type:
struct \{
x: float64
\}
--------------------------------------------------------
Source:
<hail.table.Table object at 0x28773a890>
Index:
['row']
--------------------------------------------------------
\end{sphinxVerbatim}
.. raw:: text
--------------------------------------------------------
Type:
struct {
x: float64
}
--------------------------------------------------------
Source:
<hail.table.Table object at 0x28773a890>
Index:
['row']
--------------------------------------------------------
..
Joining Tables
~~~~~~~~~~~~~~
Now that we understand the basics of how joins work, let's use a join to compute the average movie rating per genre.
We have a table ``ratings``, which contains ``user_id``, ``movie_id``, and ``rating`` fields. Group by ``movie_id`` and aggregate to get the mean rating of each movie.
..
.. nbinput:: ipython3
:execution-count: 11
t = (ratings.group_by(ratings.movie_id)
.aggregate(rating = hl.agg.mean(ratings.rating)))
t.describe()
..
.. nboutput::
.. rst-class:: highlight
.. raw:: html
<pre>
----------------------------------------
Global fields:
None
----------------------------------------
Row fields:
&#39;movie_id&#39;: int32
&#39;rating&#39;: float64
----------------------------------------
Key: [&#39;movie_id&#39;]
----------------------------------------
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
----------------------------------------
Global fields:
None
----------------------------------------
Row fields:
'movie\_id': int32
'rating': float64
----------------------------------------
Key: ['movie\_id']
----------------------------------------
\end{sphinxVerbatim}
.. raw:: text
----------------------------------------
Global fields:
None
----------------------------------------
Row fields:
'movie_id': int32
'rating': float64
----------------------------------------
Key: ['movie_id']
----------------------------------------
..
To get the mean rating by genre, we need to join in the genre field from the ``movies`` table.
..
.. nbinput:: ipython3
:execution-count: 12
t = t.annotate(genres = movies[t.movie_id].genres)
t.describe()
..
.. nboutput::
.. rst-class:: highlight
.. raw:: html
<pre>
----------------------------------------
Global fields:
None
----------------------------------------
Row fields:
&#39;movie_id&#39;: int32
&#39;rating&#39;: float64
&#39;genres&#39;: array&lt;str&gt;
----------------------------------------
Key: [&#39;movie_id&#39;]
----------------------------------------
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
----------------------------------------
Global fields:
None
----------------------------------------
Row fields:
'movie\_id': int32
'rating': float64
'genres': array<str>
----------------------------------------
Key: ['movie\_id']
----------------------------------------
\end{sphinxVerbatim}
.. raw:: text
----------------------------------------
Global fields:
None
----------------------------------------
Row fields:
'movie_id': int32
'rating': float64
'genres': array<str>
----------------------------------------
Key: ['movie_id']
----------------------------------------
..
.. nbinput:: ipython3
:execution-count: 13
t.show()
..
.. nboutput::
:more-to-come:
:class: stderr
.. rst-class:: highlight
.. raw:: html
<pre>
[Stage 10:==========================================================(1 + 0) / 1]
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
[Stage 10:==========================================================(1 + 0) / 1]
\end{sphinxVerbatim}
.. raw:: text
[Stage 10:==========================================================(1 + 0) / 1]
..
.. nboutput::
:more-to-come:
:class: stderr
.. rst-class:: highlight
.. raw:: html
<pre>
[Stage 11:&gt; (0 + 1) / 1]
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
[Stage 11:> (0 + 1) / 1]
\end{sphinxVerbatim}
.. raw:: text
[Stage 11:> (0 + 1) / 1]
..
.. only:: html
.. nboutput::
:fancy:
:class: rendered_html
.. raw:: html
<table><thead><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">movie_id</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">rating</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">genres</div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">int32</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">float64</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">array&lt;str&gt;</td></tr>
</thead><tbody><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.88e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">[&quot;Animation&quot;,&quot;Children&#x27;s&quot;,&quot;Comedy&quot;]</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.21e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">[&quot;Action&quot;,&quot;Adventure&quot;,&quot;Thriller&quot;]</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.03e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">[&quot;Thriller&quot;]</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">4</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.55e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">[&quot;Action&quot;,&quot;Comedy&quot;,&quot;Drama&quot;]</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">5</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.30e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">[&quot;Crime&quot;,&quot;Drama&quot;,&quot;Thriller&quot;]</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">6</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.58e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">[&quot;Drama&quot;]</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">7</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.80e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">[&quot;Drama&quot;,&quot;Sci-Fi&quot;]</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">8</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">4.00e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">[&quot;Children&#x27;s&quot;,&quot;Comedy&quot;,&quot;Drama&quot;]</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">9</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.90e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">[&quot;Drama&quot;]</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">10</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.83e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">[&quot;Drama&quot;,&quot;War&quot;]</td></tr>
</tbody></table><p style="background: #fdd; padding: 0.4em;">showing top 10 rows</p>
.. only:: latex
.. nboutput::
.. rst-class:: highlight
.. raw:: html
<pre>
+----------+----------+-------------------------------------+
| movie_id | rating | genres |
+----------+----------+-------------------------------------+
| int32 | float64 | array&lt;str&gt; |
+----------+----------+-------------------------------------+
| 1 | 3.88e+00 | [&#34;Animation&#34;,&#34;Children&#39;s&#34;,&#34;Comedy&#34;] |
| 2 | 3.21e+00 | [&#34;Action&#34;,&#34;Adventure&#34;,&#34;Thriller&#34;] |
| 3 | 3.03e+00 | [&#34;Thriller&#34;] |
| 4 | 3.55e+00 | [&#34;Action&#34;,&#34;Comedy&#34;,&#34;Drama&#34;] |
| 5 | 3.30e+00 | [&#34;Crime&#34;,&#34;Drama&#34;,&#34;Thriller&#34;] |
| 6 | 3.58e+00 | [&#34;Drama&#34;] |
| 7 | 3.80e+00 | [&#34;Drama&#34;,&#34;Sci-Fi&#34;] |
| 8 | 4.00e+00 | [&#34;Children&#39;s&#34;,&#34;Comedy&#34;,&#34;Drama&#34;] |
| 9 | 3.90e+00 | [&#34;Drama&#34;] |
| 10 | 3.83e+00 | [&#34;Drama&#34;,&#34;War&#34;] |
+----------+----------+-------------------------------------+
showing top 10 rows
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
+----------+----------+-------------------------------------+
| movie\_id | rating | genres |
+----------+----------+-------------------------------------+
| int32 | float64 | array<str> |
+----------+----------+-------------------------------------+
| 1 | 3.88e+00 | ["Animation","Children's","Comedy"] |
| 2 | 3.21e+00 | ["Action","Adventure","Thriller"] |
| 3 | 3.03e+00 | ["Thriller"] |
| 4 | 3.55e+00 | ["Action","Comedy","Drama"] |
| 5 | 3.30e+00 | ["Crime","Drama","Thriller"] |
| 6 | 3.58e+00 | ["Drama"] |
| 7 | 3.80e+00 | ["Drama","Sci-Fi"] |
| 8 | 4.00e+00 | ["Children's","Comedy","Drama"] |
| 9 | 3.90e+00 | ["Drama"] |
| 10 | 3.83e+00 | ["Drama","War"] |
+----------+----------+-------------------------------------+
showing top 10 rows
\end{sphinxVerbatim}
.. raw:: text
+----------+----------+-------------------------------------+
| movie_id | rating | genres |
+----------+----------+-------------------------------------+
| int32 | float64 | array<str> |
+----------+----------+-------------------------------------+
| 1 | 3.88e+00 | ["Animation","Children's","Comedy"] |
| 2 | 3.21e+00 | ["Action","Adventure","Thriller"] |
| 3 | 3.03e+00 | ["Thriller"] |
| 4 | 3.55e+00 | ["Action","Comedy","Drama"] |
| 5 | 3.30e+00 | ["Crime","Drama","Thriller"] |
| 6 | 3.58e+00 | ["Drama"] |
| 7 | 3.80e+00 | ["Drama","Sci-Fi"] |
| 8 | 4.00e+00 | ["Children's","Comedy","Drama"] |
| 9 | 3.90e+00 | ["Drama"] |
| 10 | 3.83e+00 | ["Drama","War"] |
+----------+----------+-------------------------------------+
showing top 10 rows
..
We want to group the ratings by genre, but they're packed up in an array. To unpack the genres, we can use `explode <https://hail.is/docs/0.2/hail.Table.html#hail.Table.explode>`__.
``explode`` creates a new row for each element in the value of the field, which must be a collection (array or set).
..
.. nbinput:: ipython3
:execution-count: 14
t = t.explode(t.genres)
t.show()
..
.. only:: html
.. nboutput::
:fancy:
:class: rendered_html
.. raw:: html
<table><thead><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">movie_id</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">rating</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">genres</div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">int32</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">float64</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">str</td></tr>
</thead><tbody><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.88e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Animation&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.88e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Children&#x27;s&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.88e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Comedy&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.21e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Action&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.21e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Adventure&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.21e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Thriller&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.03e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Thriller&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">4</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.55e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Action&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">4</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.55e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Comedy&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">4</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.55e+00</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Drama&quot;</td></tr>
</tbody></table><p style="background: #fdd; padding: 0.4em;">showing top 10 rows</p>
.. only:: latex
.. nboutput::
.. rst-class:: highlight
.. raw:: html
<pre>
+----------+----------+--------------+
| movie_id | rating | genres |
+----------+----------+--------------+
| int32 | float64 | str |
+----------+----------+--------------+
| 1 | 3.88e+00 | &#34;Animation&#34; |
| 1 | 3.88e+00 | &#34;Children&#39;s&#34; |
| 1 | 3.88e+00 | &#34;Comedy&#34; |
| 2 | 3.21e+00 | &#34;Action&#34; |
| 2 | 3.21e+00 | &#34;Adventure&#34; |
| 2 | 3.21e+00 | &#34;Thriller&#34; |
| 3 | 3.03e+00 | &#34;Thriller&#34; |
| 4 | 3.55e+00 | &#34;Action&#34; |
| 4 | 3.55e+00 | &#34;Comedy&#34; |
| 4 | 3.55e+00 | &#34;Drama&#34; |
+----------+----------+--------------+
showing top 10 rows
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
+----------+----------+--------------+
| movie\_id | rating | genres |
+----------+----------+--------------+
| int32 | float64 | str |
+----------+----------+--------------+
| 1 | 3.88e+00 | "Animation" |
| 1 | 3.88e+00 | "Children's" |
| 1 | 3.88e+00 | "Comedy" |
| 2 | 3.21e+00 | "Action" |
| 2 | 3.21e+00 | "Adventure" |
| 2 | 3.21e+00 | "Thriller" |
| 3 | 3.03e+00 | "Thriller" |
| 4 | 3.55e+00 | "Action" |
| 4 | 3.55e+00 | "Comedy" |
| 4 | 3.55e+00 | "Drama" |
+----------+----------+--------------+
showing top 10 rows
\end{sphinxVerbatim}
.. raw:: text
+----------+----------+--------------+
| movie_id | rating | genres |
+----------+----------+--------------+
| int32 | float64 | str |
+----------+----------+--------------+
| 1 | 3.88e+00 | "Animation" |
| 1 | 3.88e+00 | "Children's" |
| 1 | 3.88e+00 | "Comedy" |
| 2 | 3.21e+00 | "Action" |
| 2 | 3.21e+00 | "Adventure" |
| 2 | 3.21e+00 | "Thriller" |
| 3 | 3.03e+00 | "Thriller" |
| 4 | 3.55e+00 | "Action" |
| 4 | 3.55e+00 | "Comedy" |
| 4 | 3.55e+00 | "Drama" |
+----------+----------+--------------+
showing top 10 rows
..
Finally, we can get group by genre and aggregate to get the mean rating per genre.
..
.. nbinput:: ipython3
:execution-count: 15
t = (t.group_by(t.genres)
.aggregate(rating = hl.agg.mean(t.rating)))
t.show(n=100)
..
.. only:: html
.. nboutput::
:fancy:
:class: rendered_html
.. raw:: html
<table><thead><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">genres</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">rating</div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">str</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">float64</td></tr>
</thead><tbody><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Action&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2.97e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Adventure&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.14e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Animation&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.30e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Children&#x27;s&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2.92e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Comedy&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.00e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Crime&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.21e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Documentary&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.23e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Drama&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.19e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Fantasy&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2.85e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Film-Noir&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.55e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Horror&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2.73e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Musical&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.38e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Mystery&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.34e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Romance&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.24e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Sci-Fi&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.17e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Thriller&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.14e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;War&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.49e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Western&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.19e+00</td></tr>
</tbody></table>
.. only:: latex
.. nboutput::
.. rst-class:: highlight
.. raw:: html
<pre>
+---------------+----------+
| genres | rating |
+---------------+----------+
| str | float64 |
+---------------+----------+
| &#34;Action&#34; | 2.97e+00 |
| &#34;Adventure&#34; | 3.14e+00 |
| &#34;Animation&#34; | 3.30e+00 |
| &#34;Children&#39;s&#34; | 2.92e+00 |
| &#34;Comedy&#34; | 3.00e+00 |
| &#34;Crime&#34; | 3.21e+00 |
| &#34;Documentary&#34; | 3.23e+00 |
| &#34;Drama&#34; | 3.19e+00 |
| &#34;Fantasy&#34; | 2.85e+00 |
| &#34;Film-Noir&#34; | 3.55e+00 |
| &#34;Horror&#34; | 2.73e+00 |
| &#34;Musical&#34; | 3.38e+00 |
| &#34;Mystery&#34; | 3.34e+00 |
| &#34;Romance&#34; | 3.24e+00 |
| &#34;Sci-Fi&#34; | 3.17e+00 |
| &#34;Thriller&#34; | 3.14e+00 |
| &#34;War&#34; | 3.49e+00 |
| &#34;Western&#34; | 3.19e+00 |
+---------------+----------+
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
+---------------+----------+
| genres | rating |
+---------------+----------+
| str | float64 |
+---------------+----------+
| "Action" | 2.97e+00 |
| "Adventure" | 3.14e+00 |
| "Animation" | 3.30e+00 |
| "Children's" | 2.92e+00 |
| "Comedy" | 3.00e+00 |
| "Crime" | 3.21e+00 |
| "Documentary" | 3.23e+00 |
| "Drama" | 3.19e+00 |
| "Fantasy" | 2.85e+00 |
| "Film-Noir" | 3.55e+00 |
| "Horror" | 2.73e+00 |
| "Musical" | 3.38e+00 |
| "Mystery" | 3.34e+00 |
| "Romance" | 3.24e+00 |
| "Sci-Fi" | 3.17e+00 |
| "Thriller" | 3.14e+00 |
| "War" | 3.49e+00 |
| "Western" | 3.19e+00 |
+---------------+----------+
\end{sphinxVerbatim}
.. raw:: text
+---------------+----------+
| genres | rating |
+---------------+----------+
| str | float64 |
+---------------+----------+
| "Action" | 2.97e+00 |
| "Adventure" | 3.14e+00 |
| "Animation" | 3.30e+00 |
| "Children's" | 2.92e+00 |
| "Comedy" | 3.00e+00 |
| "Crime" | 3.21e+00 |
| "Documentary" | 3.23e+00 |
| "Drama" | 3.19e+00 |
| "Fantasy" | 2.85e+00 |
| "Film-Noir" | 3.55e+00 |
| "Horror" | 2.73e+00 |
| "Musical" | 3.38e+00 |
| "Mystery" | 3.34e+00 |
| "Romance" | 3.24e+00 |
| "Sci-Fi" | 3.17e+00 |
| "Thriller" | 3.14e+00 |
| "War" | 3.49e+00 |
| "Western" | 3.19e+00 |
+---------------+----------+
..
Let's do another example. This time, we'll see if we can determine what the highest rated movies are, on average, for each occupation. We start by joining the two tables ``movies`` and ``users.``
..
.. nbinput:: ipython3
:execution-count: 16
movie_data = ratings.annotate(
movie = movies[ratings.movie_id].title,
occupation = users[ratings.user_id].occupation)
movie_data.show()
..
.. only:: html
.. nboutput::
:fancy:
:class: rendered_html
.. raw:: html
<table><thead><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">user_id</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">movie_id</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">rating</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">movie</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">occupation</div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">int32</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">int32</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">int32</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">str</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">str</td></tr>
</thead><tbody><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">5</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Toy Story (1995)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;technician&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;GoldenEye (1995)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;technician&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">4</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Four Rooms (1995)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;technician&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">4</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Get Shorty (1995)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;technician&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">5</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Copycat (1995)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;technician&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">6</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">5</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;technician&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">7</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">4</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Twelve Monkeys (1995)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;technician&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">8</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Babe (1995)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;technician&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">9</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">5</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Dead Man Walking (1995)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;technician&quot;</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">10</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Richard III (1995)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;technician&quot;</td></tr>
</tbody></table><p style="background: #fdd; padding: 0.4em;">showing top 10 rows</p>
.. only:: latex
.. nboutput::
.. rst-class:: highlight
.. raw:: html
<pre>
+---------+----------+--------+
| user_id | movie_id | rating |
+---------+----------+--------+
| int32 | int32 | int32 |
+---------+----------+--------+
| 1 | 1 | 5 |
| 1 | 2 | 3 |
| 1 | 3 | 4 |
| 1 | 4 | 3 |
| 1 | 5 | 3 |
| 1 | 6 | 5 |
| 1 | 7 | 4 |
| 1 | 8 | 1 |
| 1 | 9 | 5 |
| 1 | 10 | 3 |
+---------+----------+--------+
+--------------------------------------------------------+--------------+
| movie | occupation |
+--------------------------------------------------------+--------------+
| str | str |
+--------------------------------------------------------+--------------+
| &#34;Toy Story (1995)&#34; | &#34;technician&#34; |
| &#34;GoldenEye (1995)&#34; | &#34;technician&#34; |
| &#34;Four Rooms (1995)&#34; | &#34;technician&#34; |
| &#34;Get Shorty (1995)&#34; | &#34;technician&#34; |
| &#34;Copycat (1995)&#34; | &#34;technician&#34; |
| &#34;Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)&#34; | &#34;technician&#34; |
| &#34;Twelve Monkeys (1995)&#34; | &#34;technician&#34; |
| &#34;Babe (1995)&#34; | &#34;technician&#34; |
| &#34;Dead Man Walking (1995)&#34; | &#34;technician&#34; |
| &#34;Richard III (1995)&#34; | &#34;technician&#34; |
+--------------------------------------------------------+--------------+
showing top 10 rows
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
+---------+----------+--------+
| user\_id | movie\_id | rating |
+---------+----------+--------+
| int32 | int32 | int32 |
+---------+----------+--------+
| 1 | 1 | 5 |
| 1 | 2 | 3 |
| 1 | 3 | 4 |
| 1 | 4 | 3 |
| 1 | 5 | 3 |
| 1 | 6 | 5 |
| 1 | 7 | 4 |
| 1 | 8 | 1 |
| 1 | 9 | 5 |
| 1 | 10 | 3 |
+---------+----------+--------+
+--------------------------------------------------------+--------------+
| movie | occupation |
+--------------------------------------------------------+--------------+
| str | str |
+--------------------------------------------------------+--------------+
| "Toy Story (1995)" | "technician" |
| "GoldenEye (1995)" | "technician" |
| "Four Rooms (1995)" | "technician" |
| "Get Shorty (1995)" | "technician" |
| "Copycat (1995)" | "technician" |
| "Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)" | "technician" |
| "Twelve Monkeys (1995)" | "technician" |
| "Babe (1995)" | "technician" |
| "Dead Man Walking (1995)" | "technician" |
| "Richard III (1995)" | "technician" |
+--------------------------------------------------------+--------------+
showing top 10 rows
\end{sphinxVerbatim}
.. raw:: text
+---------+----------+--------+
| user_id | movie_id | rating |
+---------+----------+--------+
| int32 | int32 | int32 |
+---------+----------+--------+
| 1 | 1 | 5 |
| 1 | 2 | 3 |
| 1 | 3 | 4 |
| 1 | 4 | 3 |
| 1 | 5 | 3 |
| 1 | 6 | 5 |
| 1 | 7 | 4 |
| 1 | 8 | 1 |
| 1 | 9 | 5 |
| 1 | 10 | 3 |
+---------+----------+--------+
+--------------------------------------------------------+--------------+
| movie | occupation |
+--------------------------------------------------------+--------------+
| str | str |
+--------------------------------------------------------+--------------+
| "Toy Story (1995)" | "technician" |
| "GoldenEye (1995)" | "technician" |
| "Four Rooms (1995)" | "technician" |
| "Get Shorty (1995)" | "technician" |
| "Copycat (1995)" | "technician" |
| "Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)" | "technician" |
| "Twelve Monkeys (1995)" | "technician" |
| "Babe (1995)" | "technician" |
| "Dead Man Walking (1995)" | "technician" |
| "Richard III (1995)" | "technician" |
+--------------------------------------------------------+--------------+
showing top 10 rows
..
Next, we'll use ``group_by`` along with the aggregator ``hl.agg.mean`` to determine the average rating of each movie by occupation. Remember that the ``group_by`` operation is always associated with an aggregation.
..
.. nbinput:: ipython3
:execution-count: 17
ratings_by_job = movie_data.group_by(
movie_data.occupation, movie_data.movie).aggregate(
mean = hl.agg.mean(movie_data.rating))
ratings_by_job.show()
..
.. nboutput::
:more-to-come:
:class: stderr
.. rst-class:: highlight
.. raw:: html
<pre>
[Stage 41:&gt; (0 + 1) / 1]
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
[Stage 41:> (0 + 1) / 1]
\end{sphinxVerbatim}
.. raw:: text
[Stage 41:> (0 + 1) / 1]
..
.. only:: html
.. nboutput::
:fancy:
:class: rendered_html
.. raw:: html
<table><thead><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">occupation</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">movie</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">mean</div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">str</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">str</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">float64</td></tr>
</thead><tbody><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;administrator&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;101 Dalmatians (1996)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2.75e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;administrator&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;12 Angry Men (1957)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">4.56e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;administrator&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;187 (1997)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2.00e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;administrator&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;2 Days in the Valley (1996)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.25e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;administrator&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;20,000 Leagues Under the Sea (1954)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.63e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;administrator&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;2001: A Space Odyssey (1968)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">4.04e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;administrator&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;39 Steps, The (1935)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">3.80e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;administrator&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;8 1/2 (1963)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">4.00e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;administrator&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;8 Seconds (1994)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">4.00e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;administrator&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;A Chef in Love (1996)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">5.00e+00</td></tr>
</tbody></table><p style="background: #fdd; padding: 0.4em;">showing top 10 rows</p>
.. only:: latex
.. nboutput::
.. rst-class:: highlight
.. raw:: html
<pre>
+-----------------+---------------------------------------+----------+
| occupation | movie | mean |
+-----------------+---------------------------------------+----------+
| str | str | float64 |
+-----------------+---------------------------------------+----------+
| &#34;administrator&#34; | &#34;101 Dalmatians (1996)&#34; | 2.75e+00 |
| &#34;administrator&#34; | &#34;12 Angry Men (1957)&#34; | 4.56e+00 |
| &#34;administrator&#34; | &#34;187 (1997)&#34; | 2.00e+00 |
| &#34;administrator&#34; | &#34;2 Days in the Valley (1996)&#34; | 3.25e+00 |
| &#34;administrator&#34; | &#34;20,000 Leagues Under the Sea (1954)&#34; | 3.63e+00 |
| &#34;administrator&#34; | &#34;2001: A Space Odyssey (1968)&#34; | 4.04e+00 |
| &#34;administrator&#34; | &#34;39 Steps, The (1935)&#34; | 3.80e+00 |
| &#34;administrator&#34; | &#34;8 1/2 (1963)&#34; | 4.00e+00 |
| &#34;administrator&#34; | &#34;8 Seconds (1994)&#34; | 4.00e+00 |
| &#34;administrator&#34; | &#34;A Chef in Love (1996)&#34; | 5.00e+00 |
+-----------------+---------------------------------------+----------+
showing top 10 rows
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
+-----------------+---------------------------------------+----------+
| occupation | movie | mean |
+-----------------+---------------------------------------+----------+
| str | str | float64 |
+-----------------+---------------------------------------+----------+
| "administrator" | "101 Dalmatians (1996)" | 2.75e+00 |
| "administrator" | "12 Angry Men (1957)" | 4.56e+00 |
| "administrator" | "187 (1997)" | 2.00e+00 |
| "administrator" | "2 Days in the Valley (1996)" | 3.25e+00 |
| "administrator" | "20,000 Leagues Under the Sea (1954)" | 3.63e+00 |
| "administrator" | "2001: A Space Odyssey (1968)" | 4.04e+00 |
| "administrator" | "39 Steps, The (1935)" | 3.80e+00 |
| "administrator" | "8 1/2 (1963)" | 4.00e+00 |
| "administrator" | "8 Seconds (1994)" | 4.00e+00 |
| "administrator" | "A Chef in Love (1996)" | 5.00e+00 |
+-----------------+---------------------------------------+----------+
showing top 10 rows
\end{sphinxVerbatim}
.. raw:: text
+-----------------+---------------------------------------+----------+
| occupation | movie | mean |
+-----------------+---------------------------------------+----------+
| str | str | float64 |
+-----------------+---------------------------------------+----------+
| "administrator" | "101 Dalmatians (1996)" | 2.75e+00 |
| "administrator" | "12 Angry Men (1957)" | 4.56e+00 |
| "administrator" | "187 (1997)" | 2.00e+00 |
| "administrator" | "2 Days in the Valley (1996)" | 3.25e+00 |
| "administrator" | "20,000 Leagues Under the Sea (1954)" | 3.63e+00 |
| "administrator" | "2001: A Space Odyssey (1968)" | 4.04e+00 |
| "administrator" | "39 Steps, The (1935)" | 3.80e+00 |
| "administrator" | "8 1/2 (1963)" | 4.00e+00 |
| "administrator" | "8 Seconds (1994)" | 4.00e+00 |
| "administrator" | "A Chef in Love (1996)" | 5.00e+00 |
+-----------------+---------------------------------------+----------+
showing top 10 rows
..
.. nbinput:: ipython3
:no-output:
..
Now we can use another ``group_by`` to determine the highest rated movie, on average, for each occupation.
The syntax here needs some explaining. The second step in the cell below is just to clean up the table created by the preceding step. If you examine the intermediate result (for example, by giving a new name to the output of the first step), you will see that there are two columns corresponding to occupation, ``occupation`` and ``val.occupation``. This is an artifact of the aggregator syntax and the fact that we are retaining the entire row from ``ratings_by_job``. So in the second line, we use
``select`` to keep those columns that we want, and also rename them to drop the ``val.`` syntax. Since ``occupation`` is a key of this table, we don't need to select for it.
..
.. nbinput:: ipython3
:execution-count: 18
highest_rated = ratings_by_job.group_by(
ratings_by_job.occupation).aggregate(
val = hl.agg.take(ratings_by_job.row,1, ordering = -ratings_by_job.mean)[0]
)
highest_rated = highest_rated.select(movie = highest_rated.val.movie,
mean = highest_rated.val.mean)
highest_rated.show()
..
.. nboutput::
:more-to-come:
:class: stderr
.. rst-class:: highlight
.. raw:: html
<pre>
[Stage 55:&gt; (0 + 1) / 1]
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
[Stage 55:> (0 + 1) / 1]
\end{sphinxVerbatim}
.. raw:: text
[Stage 55:> (0 + 1) / 1]
..
.. only:: html
.. nboutput::
:fancy:
:class: rendered_html
.. raw:: html
<table><thead><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">occupation</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">movie</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">mean</div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">str</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">str</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">float64</td></tr>
</thead><tbody><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;administrator&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;A Chef in Love (1996)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">5.00e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;artist&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;39 Steps, The (1935)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">5.00e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;doctor&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Alien (1979)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">5.00e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;educator&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Aparajito (1956)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">5.00e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;engineer&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Charade (1963)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">5.00e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;entertainment&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;American in Paris, An (1951)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">5.00e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;executive&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;A Chef in Love (1996)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">5.00e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;healthcare&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;39 Steps, The (1935)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">5.00e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;homemaker&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Beautiful Girls (1996)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">5.00e+00</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;lawyer&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Anastasia (1997)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">5.00e+00</td></tr>
</tbody></table><p style="background: #fdd; padding: 0.4em;">showing top 10 rows</p>
.. only:: latex
.. nboutput::
.. rst-class:: highlight
.. raw:: html
<pre>
+-----------------+--------------------------------+----------+
| occupation | movie | mean |
+-----------------+--------------------------------+----------+
| str | str | float64 |
+-----------------+--------------------------------+----------+
| &#34;administrator&#34; | &#34;A Chef in Love (1996)&#34; | 5.00e+00 |
| &#34;artist&#34; | &#34;39 Steps, The (1935)&#34; | 5.00e+00 |
| &#34;doctor&#34; | &#34;Alien (1979)&#34; | 5.00e+00 |
| &#34;educator&#34; | &#34;Aparajito (1956)&#34; | 5.00e+00 |
| &#34;engineer&#34; | &#34;Charade (1963)&#34; | 5.00e+00 |
| &#34;entertainment&#34; | &#34;American in Paris, An (1951)&#34; | 5.00e+00 |
| &#34;executive&#34; | &#34;A Chef in Love (1996)&#34; | 5.00e+00 |
| &#34;healthcare&#34; | &#34;39 Steps, The (1935)&#34; | 5.00e+00 |
| &#34;homemaker&#34; | &#34;Beautiful Girls (1996)&#34; | 5.00e+00 |
| &#34;lawyer&#34; | &#34;Anastasia (1997)&#34; | 5.00e+00 |
+-----------------+--------------------------------+----------+
showing top 10 rows
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
+-----------------+--------------------------------+----------+
| occupation | movie | mean |
+-----------------+--------------------------------+----------+
| str | str | float64 |
+-----------------+--------------------------------+----------+
| "administrator" | "A Chef in Love (1996)" | 5.00e+00 |
| "artist" | "39 Steps, The (1935)" | 5.00e+00 |
| "doctor" | "Alien (1979)" | 5.00e+00 |
| "educator" | "Aparajito (1956)" | 5.00e+00 |
| "engineer" | "Charade (1963)" | 5.00e+00 |
| "entertainment" | "American in Paris, An (1951)" | 5.00e+00 |
| "executive" | "A Chef in Love (1996)" | 5.00e+00 |
| "healthcare" | "39 Steps, The (1935)" | 5.00e+00 |
| "homemaker" | "Beautiful Girls (1996)" | 5.00e+00 |
| "lawyer" | "Anastasia (1997)" | 5.00e+00 |
+-----------------+--------------------------------+----------+
showing top 10 rows
\end{sphinxVerbatim}
.. raw:: text
+-----------------+--------------------------------+----------+
| occupation | movie | mean |
+-----------------+--------------------------------+----------+
| str | str | float64 |
+-----------------+--------------------------------+----------+
| "administrator" | "A Chef in Love (1996)" | 5.00e+00 |
| "artist" | "39 Steps, The (1935)" | 5.00e+00 |
| "doctor" | "Alien (1979)" | 5.00e+00 |
| "educator" | "Aparajito (1956)" | 5.00e+00 |
| "engineer" | "Charade (1963)" | 5.00e+00 |
| "entertainment" | "American in Paris, An (1951)" | 5.00e+00 |
| "executive" | "A Chef in Love (1996)" | 5.00e+00 |
| "healthcare" | "39 Steps, The (1935)" | 5.00e+00 |
| "homemaker" | "Beautiful Girls (1996)" | 5.00e+00 |
| "lawyer" | "Anastasia (1997)" | 5.00e+00 |
+-----------------+--------------------------------+----------+
showing top 10 rows
..
Let's try to get a deeper understanding of this result. Notice that every movie displayed has an *average* rating of 5, which means that every person gave these movies the highest rating. Is that unlikely? We can determine how many people rated each of these movies by working backwards and filtering our original ``movie_data`` table by fields in ``highest_rated``.
Note that in the second line below, we are taking advantage of the fact that Hail tables are keyed.
..
.. nbinput:: ipython3
:execution-count: 19
highest_rated = highest_rated.key_by(
highest_rated.occupation, highest_rated.movie)
counts_temp = movie_data.filter(
hl.is_defined(highest_rated[movie_data.occupation, movie_data.movie]))
counts = counts_temp.group_by(counts_temp.occupation, counts_temp.movie).aggregate(
counts = hl.agg.count())
counts.show()
..
.. nboutput::
:more-to-come:
:class: stderr
.. rst-class:: highlight
.. raw:: html
<pre>
[Stage 79:&gt; (0 + 1) / 1]
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
[Stage 79:> (0 + 1) / 1]
\end{sphinxVerbatim}
.. raw:: text
[Stage 79:> (0 + 1) / 1]
..
.. nboutput::
:more-to-come:
:class: stderr
.. rst-class:: highlight
.. raw:: html
<pre>
[Stage 94:&gt; (0 + 1) / 1]
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
[Stage 94:> (0 + 1) / 1]
\end{sphinxVerbatim}
.. raw:: text
[Stage 94:> (0 + 1) / 1]
..
.. nboutput::
:more-to-come:
:class: stderr
.. rst-class:: highlight
.. raw:: html
<pre>
[Stage 101:&gt; (0 + 1) / 1]
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
[Stage 101:> (0 + 1) / 1]
\end{sphinxVerbatim}
.. raw:: text
[Stage 101:> (0 + 1) / 1]
..
.. nboutput::
:more-to-come:
:class: stderr
.. rst-class:: highlight
.. raw:: html
<pre>
[Stage 108:&gt; (0 + 1) / 1]
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
[Stage 108:> (0 + 1) / 1]
\end{sphinxVerbatim}
.. raw:: text
[Stage 108:> (0 + 1) / 1]
..
.. only:: html
.. nboutput::
:fancy:
:class: rendered_html
.. raw:: html
<table><thead><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;"></div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">occupation</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">movie</div></td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; " colspan="1"><div style="text-align: left;border-bottom: solid 2px #000; padding-bottom: 5px">counts</div></td></tr><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">str</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">str</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; text-align: left;">int64</td></tr>
</thead><tbody><tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;administrator&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;A Chef in Love (1996)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;artist&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;39 Steps, The (1935)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;doctor&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Alien (1979)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;educator&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Aparajito (1956)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">2</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;engineer&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Charade (1963)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;entertainment&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;American in Paris, An (1951)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;executive&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;A Chef in Love (1996)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;healthcare&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;39 Steps, The (1935)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;homemaker&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Beautiful Girls (1996)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td></tr>
<tr><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;lawyer&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">&quot;Anastasia (1997)&quot;</td><td style="white-space: nowrap; max-width: 500px; overflow: hidden; text-overflow: ellipsis; ">1</td></tr>
</tbody></table><p style="background: #fdd; padding: 0.4em;">showing top 10 rows</p>
.. only:: latex
.. nboutput::
.. rst-class:: highlight
.. raw:: html
<pre>
+-----------------+--------------------------------+--------+
| occupation | movie | counts |
+-----------------+--------------------------------+--------+
| str | str | int64 |
+-----------------+--------------------------------+--------+
| &#34;administrator&#34; | &#34;A Chef in Love (1996)&#34; | 1 |
| &#34;artist&#34; | &#34;39 Steps, The (1935)&#34; | 1 |
| &#34;doctor&#34; | &#34;Alien (1979)&#34; | 1 |
| &#34;educator&#34; | &#34;Aparajito (1956)&#34; | 2 |
| &#34;engineer&#34; | &#34;Charade (1963)&#34; | 1 |
| &#34;entertainment&#34; | &#34;American in Paris, An (1951)&#34; | 1 |
| &#34;executive&#34; | &#34;A Chef in Love (1996)&#34; | 1 |
| &#34;healthcare&#34; | &#34;39 Steps, The (1935)&#34; | 1 |
| &#34;homemaker&#34; | &#34;Beautiful Girls (1996)&#34; | 1 |
| &#34;lawyer&#34; | &#34;Anastasia (1997)&#34; | 1 |
+-----------------+--------------------------------+--------+
showing top 10 rows
</pre>
.. raw:: latex
\begin{sphinxVerbatim}[commandchars=\\\{\}]
+-----------------+--------------------------------+--------+
| occupation | movie | counts |
+-----------------+--------------------------------+--------+
| str | str | int64 |
+-----------------+--------------------------------+--------+
| "administrator" | "A Chef in Love (1996)" | 1 |
| "artist" | "39 Steps, The (1935)" | 1 |
| "doctor" | "Alien (1979)" | 1 |
| "educator" | "Aparajito (1956)" | 2 |
| "engineer" | "Charade (1963)" | 1 |
| "entertainment" | "American in Paris, An (1951)" | 1 |
| "executive" | "A Chef in Love (1996)" | 1 |
| "healthcare" | "39 Steps, The (1935)" | 1 |
| "homemaker" | "Beautiful Girls (1996)" | 1 |
| "lawyer" | "Anastasia (1997)" | 1 |
+-----------------+--------------------------------+--------+
showing top 10 rows
\end{sphinxVerbatim}
.. raw:: text
+-----------------+--------------------------------+--------+
| occupation | movie | counts |
+-----------------+--------------------------------+--------+
| str | str | int64 |
+-----------------+--------------------------------+--------+
| "administrator" | "A Chef in Love (1996)" | 1 |
| "artist" | "39 Steps, The (1935)" | 1 |
| "doctor" | "Alien (1979)" | 1 |
| "educator" | "Aparajito (1956)" | 2 |
| "engineer" | "Charade (1963)" | 1 |
| "entertainment" | "American in Paris, An (1951)" | 1 |
| "executive" | "A Chef in Love (1996)" | 1 |
| "healthcare" | "39 Steps, The (1935)" | 1 |
| "homemaker" | "Beautiful Girls (1996)" | 1 |
| "lawyer" | "Anastasia (1997)" | 1 |
+-----------------+--------------------------------+--------+
showing top 10 rows
..
So it looks like the highest rated movies, when computed naively, mostly have a single viewer rating them. To get a better understanding of the data, we can recompute this list but only include movies which have more than 1 viewer (left as an exercise).
..
Exercises
~~~~~~~~~
- What is the favorite movie for each occupation, conditional on there being more than one viewer?
- What genres are rated most differently by men and women?
..
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment