mlin/census_dataset_presence.html

## census_dataset_presence.html
<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />

<meta name="viewport" content="width=device-width, initial-scale=1" />


<title>Census datasets presence</title>

<script>// Pandoc 2.9 adds attributes on both header and div. We remove the former (to
// be compatible with the behavior of Pandoc < 2.8).
document.addEventListener('DOMContentLoaded', function () {
  // Candidate nodes: the first child of every div.section whose class mentions "level".
  var firstChildren = document.querySelectorAll("div.section[class*='level'] > :first-child");
  Array.prototype.forEach.call(firstChildren, function (node) {
    // Only h1-h6 elements are touched; any other first child is left as it is.
    if (!/^h[1-6]$/i.test(node.tagName)) return;
    // `attributes` is a live collection, so keep removing its first entry
    // until nothing is left on the heading.
    var attrs = node.attributes;
    while (attrs.length > 0) {
      node.removeAttribute(attrs[0].name);
    }
  });
});
</script>

<style type="text/css">
/* Baseline rules for generic constructs: code whitespace, small caps,
   underline, two-column divs, hanging indents and task lists. */
code {
  white-space: pre-wrap;
}
span.smallcaps {
  font-variant: small-caps;
}
span.underline {
  text-decoration: underline;
}
div.column {
  display: inline-block;
  vertical-align: top;
  width: 50%;
}
div.hanging-indent {
  margin-left: 1.5em;
  text-indent: -1.5em;
}
ul.task-list {
  list-style: none;
}
</style>


<style type="text/css">
/* Keep whitespace inside code verbatim and never clip .sourceCode content. */
code { white-space: pre; }
.sourceCode { overflow: visible; }
</style>
<style type="text/css" data-origin="pandoc">
/*
 * Syntax-highlighting styles for the code listings (elements carrying the
 * `sourceCode` class). The inline script that follows this stylesheet finds
 * the sheet through its data-origin="pandoc" attribute and inspects top-level
 * rules whose selector text is exactly `div.sourceCode`, so those selectors
 * must stay spelled as they are.
 */
/* Listing layout: every line of a listing is a direct child span of the code element. */
pre > code.sourceCode { white-space: pre; position: relative; }
pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
/* Line spans with no content still get an explicit height. */
pre > code.sourceCode > span:empty { height: 1.2em; }
.sourceCode { overflow: visible; }
code.sourceCode > span { color: inherit; text-decoration: inherit; }
div.sourceCode { margin: 1em 0; }
pre.sourceCode { margin: 0; }
/* On screen, the listing wrapper scrolls when its content overflows. */
@media screen {
div.sourceCode { overflow: auto; }
}
/* In print, long lines wrap with a hanging indent. */
@media print {
pre > code.sourceCode { white-space: pre-wrap; }
pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
}
/* Line numbering for pre.numberSource: a `source-line` counter is incremented
   once per line span and printed in the ::before of the line's first anchor. */
pre.numberSource code
{ counter-reset: source-line 0; }
pre.numberSource code > span
{ position: relative; left: -4em; counter-increment: source-line; }
pre.numberSource code > span > a:first-child::before
{ content: counter(source-line);
position: relative; left: -1em; text-align: right; vertical-align: baseline;
border: none; display: inline-block;
-webkit-touch-callout: none; -webkit-user-select: none;
-khtml-user-select: none; -moz-user-select: none;
-ms-user-select: none; user-select: none;
padding: 0 4px; width: 4em;
color: #aaaaaa;
}
pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; }
/* Empty rule: it sets neither color nor background-color, which are the two
   properties the follow-up script checks on `div.sourceCode` rules. */
div.sourceCode
{ }
@media screen {
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
}
/* Token colors. Each two-letter class marks one highlighting token category on
   a span inside a code element; in this document's listings, for example,
   `co` wraps comments, `st` string literals and `fu` function names. */
code span.al { color: #ff0000; font-weight: bold; }
code span.an { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.at { color: #7d9029; }
code span.bn { color: #40a070; }
code span.bu { color: #008000; }
code span.cf { color: #007020; font-weight: bold; }
code span.ch { color: #4070a0; }
code span.cn { color: #880000; }
code span.co { color: #60a0b0; font-style: italic; }
code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.do { color: #ba2121; font-style: italic; }
code span.dt { color: #902000; }
code span.dv { color: #40a070; }
code span.er { color: #ff0000; font-weight: bold; }
code span.ex { }
code span.fl { color: #40a070; }
code span.fu { color: #06287e; }
code span.im { color: #008000; font-weight: bold; }
code span.in { color: #60a0b0; font-weight: bold; font-style: italic; }
code span.kw { color: #007020; font-weight: bold; }
code span.op { color: #666666; }
code span.ot { color: #007020; }
code span.pp { color: #bc7a00; }
code span.sc { color: #4070a0; }
code span.ss { color: #bb6688; }
code span.st { color: #4070a0; }
code span.va { color: #19177c; }
code span.vs { color: #4070a0; }
code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; }
</style>
<script>
// apply pandoc div.sourceCode style to pre.sourceCode instead
(function() {
  // Walk every stylesheet attached to the document and, inside the one(s)
  // tagged data-origin="pandoc", turn a colored `div.sourceCode` rule into
  // an equivalent `pre.sourceCode` rule at the same position.
  var sheets = document.styleSheets;
  for (var i = 0; i < sheets.length; i++) {
    var sheet = sheets[i];
    var owner = sheet.ownerNode;
    // ownerNode is null when no node associates the sheet with the document,
    // and not every owner exposes `dataset`; skip such sheets instead of
    // throwing, which would abort processing of all remaining sheets.
    if (!owner || !owner.dataset || owner.dataset["origin"] !== "pandoc") continue;
    // Reading cssRules can throw (e.g. for sheets the page may not inspect).
    var rules;
    try { rules = sheet.cssRules; } catch (e) { continue; }
    var j = 0;
    while (j < rules.length) {
      var rule = rules[j];
      // check if there is a div.sourceCode rule
      if (rule.type !== rule.STYLE_RULE || rule.selectorText !== "div.sourceCode") {
        j++;
        continue;
      }
      var style = rule.style.cssText;
      // check if color or background-color is set
      if (rule.style.color === '' && rule.style.backgroundColor === '') {
        j++;
        continue;
      }
      // replace div.sourceCode by a pre.sourceCode rule; j is deliberately not
      // advanced here: the next pass sees the new pre.sourceCode rule at index
      // j, which fails the selector test above and moves the cursor forward.
      sheet.deleteRule(j);
      sheet.insertRule('pre.sourceCode{' + style + '}', j);
    }
  }
})();
</script>


<style type="text/css">body {
/* Page theme: one centered column, at most 700px wide, with 2em side padding. */
background-color: #fff;
margin: 1em auto;
max-width: 700px;
overflow: visible;
padding-left: 2em;
padding-right: 2em;
font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
font-size: 14px;
line-height: 1.35;
}
/* Box for the element with id TOC (a table of contents, going by the id and
   the .toctitle class): fixed 400px width, light grey, rounded border. */
#TOC {
clear: both;
margin: 0 0 10px 10px;
padding: 4px;
width: 400px;
border: 1px solid #CCCCCC;
border-radius: 5px;
background-color: #f6f6f6;
font-size: 13px;
line-height: 1.3;
}
#TOC .toctitle {
font-weight: bold;
font-size: 15px;
margin-left: 5px;
}
#TOC ul {
padding-left: 40px;
margin-left: -1.5em;
margin-top: 5px;
margin-bottom: 5px;
}
/* Nested lists inside the box are pulled further to the left. */
#TOC ul ul {
margin-left: -2em;
}
#TOC li {
line-height: 16px;
}
/* Tables: centered, collapsed borders, shaded header and `even` rows. */
table {
margin: 1em auto;
border-width: 1px;
border-color: #DDDDDD;
border-style: outset;
border-collapse: collapse;
}
table th {
border-width: 2px;
padding: 5px;
border-style: inset;
}
table td {
border-width: 1px;
border-style: inset;
line-height: 18px;
padding: 5px 5px;
}
/* Drop the left/right border styles so only horizontal rules remain. */
table, table th, table td {
border-left-style: none;
border-right-style: none;
}
table thead, table tr.even {
background-color: #f7f7f7;
}
p {
margin: 0.5em 0;
}
blockquote {
background-color: #f6f6f6;
padding: 0.25em 0.75em;
}
/* Horizontal rule: all borders cleared, then a single 1px line drawn on top. */
hr {
border: none;
border-top: 1px solid #777;
margin: 28px 0;
}
/* Definition lists: bold terms, descriptions indented by 13px. */
dl {
margin-left: 0;
}
dl dd {
margin-bottom: 13px;
margin-left: 13px;
}
dl dt {
font-weight: bold;
}
/* Unordered lists: circle markers placed outside the item box. */
ul {
margin-top: 0;
}
ul li {
list-style: circle outside;
}
ul ul {
margin-bottom: 0;
}
/* Code, inline and block: light grey background, dark text, wrapping allowed. */
pre, code {
background-color: #f7f7f7;
border-radius: 3px;
color: #333;
white-space: pre-wrap;
}
pre {
border-radius: 3px;
margin: 5px 0px 10px 0px;
padding: 10px;
}
/* pre elements that carry no class attribute at all. */
pre:not([class]) {
background-color: #f7f7f7;
}
code {
font-family: Consolas, Monaco, 'Courier New', monospace;
font-size: 85%;
}
/* Inline code sitting directly inside a paragraph or list item. */
p > code, li > code {
padding: 2px 0px;
}
div.figure {
text-align: center;
}
/* Images: white background with 2px padding inside a thin, rounded grey border. */
img {
background-color: #FFFFFF;
padding: 2px;
border: 1px solid #CCCCCC;
border-radius: 3px;
margin: 0 5px;
}
/* Headings: h2-h6 are underlined with a bottom border that thins with depth. */
h1 {
margin-top: 0;
font-size: 35px;
line-height: 40px;
}
h2 {
border-bottom: 4px solid #f7f7f7;
padding-top: 10px;
padding-bottom: 2px;
font-size: 145%;
}
h3 {
border-bottom: 2px solid #f7f7f7;
padding-top: 10px;
font-size: 120%;
}
h4 {
border-bottom: 1px solid #f7f7f7;
margin-left: 8px;
font-size: 105%;
}
h5, h6 {
border-bottom: 1px solid #ccc;
font-size: 105%;
}
/* Links: no underline by default, with distinct hover and visited colors. */
a {
color: #0033dd;
text-decoration: none;
}
a:hover {
color: #6666ff; }
a:visited {
color: #800080; }
a:visited:hover {
color: #BB00BB; }
/* Links whose href starts with "http:" or "https:" are underlined. */
a[href^="http:"] {
text-decoration: underline; }
a[href^="https:"] {
text-decoration: underline; }

/* Token colors for spans that are DIRECT children of a code element (child
   combinator). In this document's highlighted listings the token spans sit
   inside per-line spans, so these selectors do not match them there. */
code > span.kw { color: #555; font-weight: bold; }
code > span.dt { color: #902000; }
code > span.dv { color: #40a070; }
code > span.bn { color: #d14; }
code > span.fl { color: #d14; }
code > span.ch { color: #d14; }
code > span.st { color: #d14; }
code > span.co { color: #888888; font-style: italic; }
code > span.ot { color: #007020; }
code > span.al { color: #ff0000; font-weight: bold; }
code > span.fu { color: #900; font-weight: bold; }
code > span.er { color: #a61717; background-color: #e3d2d2; }
</style>


</head>

<body>


<h1 class="title toc-ignore">Census datasets presence</h1>


<!--
THIS VIGNETTE IS BASED ON:
https://github.com/chanzuckerberg/cellxgene-census/blob/main/api/python/notebooks/api_demo/census_dataset_presence.ipynb
-->
<p><em>Goal:</em> demonstrate basic use of the
<code>datasets_presence_matrix</code> array.</p>
<p>The presence matrix is a sparse array, indicating which features
(var) were present in each dataset. The array has dimensions
[n_datasets, n_var], and is stored in the SOMA Measurement
<code>varp</code> collection. The first dimension is indexed by the
<code>soma_joinid</code> in the <code>census_datasets</code> dataframe.
The second is indexed by the <code>soma_joinid</code> in the
<code>var</code> dataframe of the measurement.</p>
<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a>census <span class="ot">&lt;-</span> cellxgene.census<span class="sc">::</span><span class="fu">open_soma</span>()</span>
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="co"># Grab the experiment containing human data, and the measurement therein with RNA</span></span>
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a>human <span class="ot">&lt;-</span> census<span class="sc">$</span><span class="fu">get</span>(<span class="st">&quot;census_data&quot;</span>)<span class="sc">$</span><span class="fu">get</span>(<span class="st">&quot;homo_sapiens&quot;</span>)</span>
<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a>human_rna <span class="ot">&lt;-</span> human<span class="sc">$</span>ms<span class="sc">$</span><span class="fu">get</span>(<span class="st">&quot;RNA&quot;</span>)</span>
<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a><span class="co"># The census-wide datasets</span></span>
<span id="cb1-7"><a href="#cb1-7" aria-hidden="true" tabindex="-1"></a>datasets_df <span class="ot">&lt;-</span> <span class="fu">as.data.frame</span>(census<span class="sc">$</span><span class="fu">get</span>(<span class="st">&quot;census_info&quot;</span>)<span class="sc">$</span><span class="fu">get</span>(<span class="st">&quot;datasets&quot;</span>)<span class="sc">$</span><span class="fu">read</span>())</span>
<span id="cb1-8"><a href="#cb1-8" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(datasets_df)</span>
<span id="cb1-9"><a href="#cb1-9" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; # A tibble: 522 × 8</span></span>
<span id="cb1-10"><a href="#cb1-10" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;    soma_joinid collection_id           collection_name collection_doi dataset_id</span></span>
<span id="cb1-11"><a href="#cb1-11" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;          &lt;int&gt; &lt;chr&gt;                   &lt;chr&gt;           &lt;chr&gt;          &lt;chr&gt;     </span></span>
<span id="cb1-12"><a href="#cb1-12" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  1           0 43d4bb39-21af-4d05-b97… Transcriptiona… 10.1016/j.cel… f512b8b6-…</span></span>
<span id="cb1-13"><a href="#cb1-13" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  2           1 d36ca85c-3e8b-444c-ba3… A molecular at… 10.1101/2022.… 90d4a63b-…</span></span>
<span id="cb1-14"><a href="#cb1-14" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  3           2 d36ca85c-3e8b-444c-ba3… A molecular at… 10.1101/2022.… d1207c81-…</span></span>
<span id="cb1-15"><a href="#cb1-15" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  4           3 2b02dff7-e427-4cdc-96f… Single-Cell An… 10.1016/j.cel… 36c867a7-…</span></span>
<span id="cb1-16"><a href="#cb1-16" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  5           4 e9eec7f5-8519-42f6-99b… Humoral immuni… 10.1016/j.coi… 58b01044-…</span></span>
<span id="cb1-17"><a href="#cb1-17" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  6           5 a72afd53-ab92-4511-88d… Single-cell at… 10.1038/s4159… 456e8b9b-…</span></span>
<span id="cb1-18"><a href="#cb1-18" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  7           6 e4c9ed14-e560-4900-a3b… A molecular si… 10.1038/s4158… d8da613f-…</span></span>
<span id="cb1-19"><a href="#cb1-19" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  8           7 4796c91c-9d8f-4692-be4… MSK SPECTRUM –… 10.1038/s4158… 97d9238c-…</span></span>
<span id="cb1-20"><a href="#cb1-20" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  9           8 4796c91c-9d8f-4692-be4… MSK SPECTRUM –… 10.1038/s4158… e3a7e927-…</span></span>
<span id="cb1-21"><a href="#cb1-21" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; 10           9 4796c91c-9d8f-4692-be4… MSK SPECTRUM –… 10.1038/s4158… 0caedec7-…</span></span>
<span id="cb1-22"><a href="#cb1-22" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; # ℹ 512 more rows</span></span>
<span id="cb1-23"><a href="#cb1-23" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; # ℹ 3 more variables: dataset_title &lt;chr&gt;, dataset_h5ad_path &lt;chr&gt;,</span></span>
<span id="cb1-24"><a href="#cb1-24" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; #   dataset_total_cell_count &lt;int&gt;</span></span></code></pre></div>
<p>Next, read the entire presence matrix (for Homo sapiens) into a
<code>Matrix::sparseMatrix</code>. A convenience API provides this
capability:</p>
<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a>presence_matrix <span class="ot">&lt;-</span> cellxgene.census<span class="sc">::</span><span class="fu">get_presence_matrix</span>(census, <span class="st">&quot;Homo sapiens&quot;</span>, <span class="st">&quot;RNA&quot;</span>)</span>
<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(<span class="fu">dim</span>(presence_matrix))</span>
<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; [1]   522 60664</span></span></code></pre></div>
<p>We also need the <code>var</code> dataframe, which is read into an R
data frame for convenient manipulation:</p>
<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a>var_df <span class="ot">&lt;-</span> <span class="fu">as.data.frame</span>(human_rna<span class="sc">$</span>var<span class="sc">$</span><span class="fu">read</span>())</span>
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(var_df)</span>
<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; # A tibble: 60,664 × 4</span></span>
<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;    soma_joinid feature_id      feature_name   feature_length</span></span>
<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;          &lt;int&gt; &lt;chr&gt;           &lt;chr&gt;                   &lt;int&gt;</span></span>
<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  1           0 ENSG00000238009 RP11-34P13.7             3726</span></span>
<span id="cb3-7"><a href="#cb3-7" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  2           1 ENSG00000279457 WASH9P                   1397</span></span>
<span id="cb3-8"><a href="#cb3-8" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  3           2 ENSG00000228463 AP006222.1               8224</span></span>
<span id="cb3-9"><a href="#cb3-9" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  4           3 ENSG00000237094 RP4-669L17.4             6204</span></span>
<span id="cb3-10"><a href="#cb3-10" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  5           4 ENSG00000230021 RP11-206L10.17           5495</span></span>
<span id="cb3-11"><a href="#cb3-11" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  6           5 ENSG00000237491 LINC01409                8413</span></span>
<span id="cb3-12"><a href="#cb3-12" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  7           6 ENSG00000177757 FAM87B                   1947</span></span>
<span id="cb3-13"><a href="#cb3-13" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  8           7 ENSG00000225880 LINC00115                1317</span></span>
<span id="cb3-14"><a href="#cb3-14" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  9           8 ENSG00000230368 FAM41C                   1971</span></span>
<span id="cb3-15"><a href="#cb3-15" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; 10           9 ENSG00000230699 RP11-54O7.1              3043</span></span>
<span id="cb3-16"><a href="#cb3-16" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; # ℹ 60,654 more rows</span></span></code></pre></div>
<div id="is-a-feature-present-in-a-dataset" class="section level2">
<h2>Is a feature present in a dataset?</h2>
<p><em>Goal:</em> test if a given feature is present in a given
dataset.</p>
<p><strong>Important:</strong> the (one-based) indexes in the sparse
presence matrix correspond to the (zero-based) <code>soma_joinid</code>
+ 1. In other words:</p>
<ul>
<li>the first dimension of the presence matrix is (one plus) the
dataset’s <code>soma_joinid</code> as stored in the
<code>census_datasets</code> dataframe.</li>
<li>the second dimension of the presence matrix is (one plus) the
feature’s <code>soma_joinid</code> as stored in the <code>var</code>
dataframe.</li>
</ul>
<div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a>var_joinid <span class="ot">&lt;-</span> var_df<span class="sc">$</span>soma_joinid[var_df<span class="sc">$</span>feature_id <span class="sc">==</span> <span class="st">&quot;ENSG00000286096&quot;</span>]</span>
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a>dataset_joinid <span class="ot">&lt;-</span> datasets_df<span class="sc">$</span>soma_joinid[datasets_df<span class="sc">$</span>dataset_id <span class="sc">==</span> <span class="st">&quot;97a17473-e2b1-4f31-a544-44a60773e2dd&quot;</span>]</span>
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a>is_present <span class="ot">&lt;-</span> presence_matrix[dataset_joinid <span class="sc">+</span> <span class="dv">1</span>, var_joinid <span class="sc">+</span> <span class="dv">1</span>]</span>
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a><span class="fu">cat</span>(<span class="fu">paste</span>(<span class="st">&quot;Feature is&quot;</span>, <span class="cf">if</span> (is_present) <span class="st">&quot;present.&quot;</span> <span class="cf">else</span> <span class="st">&quot;not present.&quot;</span>))</span>
<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; Feature is present.</span></span></code></pre></div>
</div>
<div id="what-datasets-contain-a-feature" class="section level2">
<h2>What datasets contain a feature?</h2>
<p><em>Goal:</em> look up all datasets that have a feature_id
present.</p>
<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Grab the feature&#39;s soma_joinid from the var dataframe</span></span>
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a>var_joinid <span class="ot">&lt;-</span> var_df<span class="sc">$</span>soma_joinid[var_df<span class="sc">$</span>feature_id <span class="sc">==</span> <span class="st">&quot;ENSG00000286096&quot;</span>]</span>
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a><span class="co"># The presence matrix is indexed by the joinids of the dataset and var dataframes,</span></span>
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a><span class="co"># so slice out the feature of interest by its joinid.</span></span>
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a>dataset_joinids <span class="ot">&lt;-</span> datasets_df<span class="sc">$</span>soma_joinid[presence_matrix[, var_joinid <span class="sc">+</span> <span class="dv">1</span>] <span class="sc">!=</span> <span class="dv">0</span>]</span>
<span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb5-8"><a href="#cb5-8" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(datasets_df[dataset_joinids <span class="sc">+</span> <span class="dv">1</span>, ])</span>
<span id="cb5-9"><a href="#cb5-9" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; # A tibble: 24 × 8</span></span>
<span id="cb5-10"><a href="#cb5-10" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;    soma_joinid collection_id           collection_name collection_doi dataset_id</span></span>
<span id="cb5-11"><a href="#cb5-11" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;          &lt;int&gt; &lt;chr&gt;                   &lt;chr&gt;           &lt;chr&gt;          &lt;chr&gt;     </span></span>
<span id="cb5-12"><a href="#cb5-12" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  1          89 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… 07b1d7c8-…</span></span>
<span id="cb5-13"><a href="#cb5-13" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  2         102 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… 7c1c3d47-…</span></span>
<span id="cb5-14"><a href="#cb5-14" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  3         103 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… 9372df2d-…</span></span>
<span id="cb5-15"><a href="#cb5-15" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  4         131 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… dd03ce70-…</span></span>
<span id="cb5-16"><a href="#cb5-16" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  5         145 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… 7a0a8891-…</span></span>
<span id="cb5-17"><a href="#cb5-17" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  6         147 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… d2b5efc1-…</span></span>
<span id="cb5-18"><a href="#cb5-18" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  7         151 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… f8dda921-…</span></span>
<span id="cb5-19"><a href="#cb5-19" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  8         154 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… 3a7f3ab4-…</span></span>
<span id="cb5-20"><a href="#cb5-20" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  9         156 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… bdb26abd-…</span></span>
<span id="cb5-21"><a href="#cb5-21" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; 10         158 283d65eb-dd53-496d-adb… Transcriptomic… 10.1101/2022.… 5e5ab909-…</span></span>
<span id="cb5-22"><a href="#cb5-22" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; # ℹ 14 more rows</span></span>
<span id="cb5-23"><a href="#cb5-23" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; # ℹ 3 more variables: dataset_title &lt;chr&gt;, dataset_h5ad_path &lt;chr&gt;,</span></span>
<span id="cb5-24"><a href="#cb5-24" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; #   dataset_total_cell_count &lt;int&gt;</span></span></code></pre></div>
</div>
<div id="what-features-are-in-a-dataset" class="section level2">
<h2>What features are in a dataset?</h2>
<p><em>Goal:</em> look up the features present in a given dataset.</p>
<p>This example also demonstrates the ability to do the query on
multiple datasets.</p>
<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Slice the dataset(s) of interest, and get the joinid(s)</span></span>
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a>dataset_joinids <span class="ot">&lt;-</span> datasets_df<span class="sc">$</span>soma_joinid[datasets_df<span class="sc">$</span>collection_id <span class="sc">==</span> <span class="st">&quot;17481d16-ee44-49e5-bcf0-28c0780d8c4a&quot;</span>]</span>
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a><span class="co"># Slice the presence matrix by the first dimension, i.e., by dataset</span></span>
<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a>var_joinids <span class="ot">&lt;-</span> var_df<span class="sc">$</span>soma_joinid[<span class="fu">which</span>(Matrix<span class="sc">::</span><span class="fu">colSums</span>(presence_matrix[dataset_joinids <span class="sc">+</span> <span class="dv">1</span>, ]) <span class="sc">&gt;</span> <span class="dv">0</span>)]</span>
<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a></span>
<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a><span class="fu">print</span>(var_df[var_joinids <span class="sc">+</span> <span class="dv">1</span>, ])</span>
<span id="cb6-8"><a href="#cb6-8" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; # A tibble: 27,211 × 4</span></span>
<span id="cb6-9"><a href="#cb6-9" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;    soma_joinid feature_id      feature_name   feature_length</span></span>
<span id="cb6-10"><a href="#cb6-10" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;          &lt;int&gt; &lt;chr&gt;           &lt;chr&gt;                   &lt;int&gt;</span></span>
<span id="cb6-11"><a href="#cb6-11" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  1           0 ENSG00000238009 RP11-34P13.7             3726</span></span>
<span id="cb6-12"><a href="#cb6-12" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  2           1 ENSG00000279457 WASH9P                   1397</span></span>
<span id="cb6-13"><a href="#cb6-13" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  3           2 ENSG00000228463 AP006222.1               8224</span></span>
<span id="cb6-14"><a href="#cb6-14" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  4           3 ENSG00000237094 RP4-669L17.4             6204</span></span>
<span id="cb6-15"><a href="#cb6-15" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  5           4 ENSG00000230021 RP11-206L10.17           5495</span></span>
<span id="cb6-16"><a href="#cb6-16" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  6           5 ENSG00000237491 LINC01409                8413</span></span>
<span id="cb6-17"><a href="#cb6-17" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  7           6 ENSG00000177757 FAM87B                   1947</span></span>
<span id="cb6-18"><a href="#cb6-18" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  8           7 ENSG00000225880 LINC00115                1317</span></span>
<span id="cb6-19"><a href="#cb6-19" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt;  9           8 ENSG00000230368 FAM41C                   1971</span></span>
<span id="cb6-20"><a href="#cb6-20" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; 10           9 ENSG00000230699 RP11-54O7.1              3043</span></span>
<span id="cb6-21"><a href="#cb6-21" aria-hidden="true" tabindex="-1"></a><span class="co">#&gt; # ℹ 27,201 more rows</span></span></code></pre></div>
</div>


<!-- code folding -->


<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    // Build a <script> element that points at the hosted MathJax bundle and
    // attach it to the document head, so the library is fetched at runtime.
    var mathjaxLoader = document.createElement("script");
    mathjaxLoader.type = "text/javascript";
    mathjaxLoader.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    var headElement = document.getElementsByTagName("head")[0];
    headElement.appendChild(mathjaxLoader);
  })();
</script>

</body>
</html>