andreagrioni/query_biomart.Rmd

## query_biomart.Rmd
#### BiomaRt Biotype Analysis

See BioMartR documentation [here](https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html)

Load ensembl db annotation.

```{r}
# load ensembl
ensembl <- useEnsembl(
  biomart = "genes",
  dataset = "hsapiens_gene_ensembl"
  )
```

retrieve annotation for select proteins.

```{r}
mapping <- getBM(
  attributes = c(
    'ensembl_gene_id',
    #'ensembl_transcript_id',
    'hgnc_symbol',
    "go_id", "namespace_1003", "name_1006"),
  filters = 'hgnc_symbol',
  values = my_annotation$EntrezGeneSymbol %>% unique,
  mart = ensembl
)
# map
mapping
```

Sort Biomart Annotation and filter for the three most common terms.

```{r}
# sort and filter
mapping %>%
  # look for biological processes only
  dplyr::filter(namespace_1003 == "biological_process") %>%
  dplyr::group_by(name_1006) %>%
  dplyr::tally() %>%
  dplyr::arrange(dplyr::desc(n)) %>%
  dplyr::slice_head(n=40) %>%
  dplyr::pull(name_1006) -> my_terms
# print out
glue::glue("my_terms are:\t{my_terms}")
# filter table
mapping %>%
  dplyr::filter(namespace_1003 == "biological_process", name_1006 %in% my_terms) -> placeholder
#
placeholder %>%
  dplyr::select(hgnc_symbol, name_1006) %>%
  dplyr::mutate(value = 1) %>%
  tidyr::pivot_wider(names_from=name_1006, values_fn=min ) -> my_biomart

my_biomart %>%
  as.data.frame -> my_biomart_df
rownames(my_biomart_df) <- my_biomart_df$hgnc_symbol
# replace NA with 0
my_biomart_df[is.na(my_biomart_df)] <- 0

```

set color code for heatmap

```{r}
# set minimum
min_val <- -10
# set max
max_val <- +10
# Sets the minimum (0), the maximum (15), and the increasing steps (+1) for the color scale
# Note: if some of your genes are outside of this range, they will appear white on the heatmap
breaksList = seq(min_val, max_val, by = 0.1)
# Defines the vector of colors for the legend (it has to be of the same lenght of breaksList)
color_code = colorRampPalette(c("blue", "white", "red"))(length(breaksList))
```

make pheatmap

```{r, eval=FALSE}
# create categorical color vector for terms
color_vctr <- c( "N" = "white", "Y" = "red")
annoCol <- rep(list(color_vctr), 10)
names(annoCol) <- my_terms
# draw heatmap
pheatmap::pheatmap(
  my_matrix,
  scale="row",
  cluster_rows=T,
  cluster_cols=T,
  show_colnames=FALSE,
  color=color_code,
  breaks=breaksList,
  cellwidth=3,
  cellheight=5,
  fontsize_row=4,
  fontsize_col=4,
  treeheight_row=0.0,
  treeheight_col=0.0,
  annotation_row = my_biomart_df %>% dplyr::select(-hgnc_symbol),
  annotation_col = my_metadata %>% dplyr::select(ARMCD),
  annotation_colors = annoCol,
  annotation_names_row=TRUE,
  annotation_legend=FALSE,
#  annotation_row=my_annotation %>% dplyr::select(EntrezGeneSymbol),
  filename=fs::path(output_img, "hm_common.png"),
  #width = 5,
  #height = 5
  )
```
	#### BiomaRt Biotype Analysis

	See BioMartR documentation [here](https://cran.r-project.org/web/packages/biomartr/vignettes/Functional_Annotation.html)

	Load ensembl db annotation.

	```{r}
	# load ensembl
	ensembl <- useEnsembl(
	biomart = "genes",
	dataset = "hsapiens_gene_ensembl"
	)
	```

	retrieve annotation for select proteins.

	```{r}
	mapping <- getBM(
	attributes = c(
	'ensembl_gene_id',
	#'ensembl_transcript_id',
	'hgnc_symbol',
	"go_id", "namespace_1003", "name_1006"),
	filters = 'hgnc_symbol',
	values = my_annotation$EntrezGeneSymbol %>% unique,
	mart = ensembl
	)
	# map
	mapping
	```

	Sort Biomart Annotation and filter for the three most common terms.

	```{r}
	# sort and filter
	mapping %>%
	# look for biological processes only
	dplyr::filter(namespace_1003 == "biological_process") %>%
	dplyr::group_by(name_1006) %>%
	dplyr::tally() %>%
	dplyr::arrange(dplyr::desc(n)) %>%
	dplyr::slice_head(n=40) %>%
	dplyr::pull(name_1006) -> my_terms
	# print out
	glue::glue("my_terms are:\t{my_terms}")
	# filter table
	mapping %>%
	dplyr::filter(namespace_1003 == "biological_process", name_1006 %in% my_terms) -> placeholder
	#
	placeholder %>%
	dplyr::select(hgnc_symbol, name_1006) %>%
	dplyr::mutate(value = 1) %>%
	tidyr::pivot_wider(names_from=name_1006, values_fn=min ) -> my_biomart

	my_biomart %>%
	as.data.frame -> my_biomart_df
	rownames(my_biomart_df) <- my_biomart_df$hgnc_symbol
	# replace NA with 0
	my_biomart_df[is.na(my_biomart_df)] <- 0

	```

	set color code for heatmap

	```{r}
	# set minimum
	min_val <- -10
	# set max
	max_val <- +10
	# Sets the minimum (0), the maximum (15), and the increasing steps (+1) for the color scale
	# Note: if some of your genes are outside of this range, they will appear white on the heatmap
	breaksList = seq(min_val, max_val, by = 0.1)
	# Defines the vector of colors for the legend (it has to be of the same lenght of breaksList)
	color_code = colorRampPalette(c("blue", "white", "red"))(length(breaksList))
	```

	make pheatmap

	```{r, eval=FALSE}
	# create categorical color vector for terms
	color_vctr <- c( "N" = "white", "Y" = "red")
	annoCol <- rep(list(color_vctr), 10)
	names(annoCol) <- my_terms
	# draw heatmap
	pheatmap::pheatmap(
	my_matrix,
	scale="row",
	cluster_rows=T,
	cluster_cols=T,
	show_colnames=FALSE,
	color=color_code,
	breaks=breaksList,
	cellwidth=3,
	cellheight=5,
	fontsize_row=4,
	fontsize_col=4,
	treeheight_row=0.0,
	treeheight_col=0.0,
	annotation_row = my_biomart_df %>% dplyr::select(-hgnc_symbol),
	annotation_col = my_metadata %>% dplyr::select(ARMCD),
	annotation_colors = annoCol,
	annotation_names_row=TRUE,
	annotation_legend=FALSE,
	# annotation_row=my_annotation %>% dplyr::select(EntrezGeneSymbol),
	filename=fs::path(output_img, "hm_common.png"),
	#width = 5,
	#height = 5
	)
	```