Skip to content

Instantly share code, notes, and snippets.

View acvill's full-sized avatar
🪗

Albert Vill acvill

🪗
View GitHub Profile
@acvill
acvill / tri2sym.R
Last active October 24, 2023 15:50
create a symmetric matrix by reflecting the top or bottom triangle
tri2sym <-
function(m, triangle) {
if (!is.numeric(m)) {
message("matrix must be numeric")
stop()
}
if (dim(m)[1] != dim(m)[2]) {
message("matrix must be square")
stop()
}
@acvill
acvill / entrez2dss.R
Created October 17, 2023 13:43
Given a list of nucleotide IDs, fetch sequences and write to DNAStringSet
entrez2dss <-
function(id_list) {
require(rentrez) # v1.2.3
require(Biostrings) # v2.66.0
require(stringr) # v1.5.0
# fetch all sequences, trim empty elements
raw_ <-
entrez_fetch(db = "nucleotide",
@acvill
acvill / make_unambiguous.R
Created August 16, 2023 16:35
Given a DNA string with ambiguous IUPAC characters, print all the possible unambiguous DNA strings
make_unambiguous <- function(dna) {
require(tidyverse)
require(S4Vectors)
iupac <- tibble(code = c("A", "C", "G", "T",
"R", "Y", "S", "W",
"K", "M", "B", "D",
"H", "V", "N"),
base = c("A", "C", "G", "T",
"AG", "CT", "GC", "AT",
"GT", "AC", "CGT", "AGT",
@acvill
acvill / plot_vcf.R
Last active June 6, 2023 21:05
Given vcf and gff, plot position and allele freq. of SNPs and indels over genomic ranges
# Given vcf and gff, plot position and allele freq. of SNPs and indels over ranges
# tested with VCFv4.1 and gff version 3
# expects certain vcf and gff attributes, may not work with all files
plot_vcf <- function(vcf, gff, ranges) {
require(tidyverse) # v2.0.0
require(gggenes) # v0.5.0
require(patchwork) # v1.1.2
@acvill
acvill / parse_STREME.R
Last active May 30, 2023 13:41
read motifs in STREME text format to a named list of position-weight matrices
# assumes fixed number of lines in header and footer
# might change with different versions of MEME suite
read_PWMs <- function(file) {
pset <- head(readLines(con = file, n = -10)[-c(1:29)], -10)
pset <- subset(pset, !grepl(pattern = "letter", pset))
i1 <- !nzchar(pset)
pset <- unname(split(pset[!i1], cumsum(i1)[!i1]))
names(pset) <- gsub(pattern = " ",
replacement = "_",
@acvill
acvill / forna_colors.R
Last active May 25, 2023 17:05
given a nucleotide sequence, generate an index-color string recognized by forna
# requires the native pipe from R >= 4.1
## maps each base to a particular color
## colors can be named or hex values
## outputs a format recognized by the forna webapp's custom color editor
## http://rna.tbi.univie.ac.at/forna/forna.html
forna_colors <- function(sequence, colormap) {
inseq <- gsub(x = strsplit(toupper(as.character(sequence)),split="")[[1]],
pattern="T",
replacement="U")
@acvill
acvill / msa_motifs.R
Last active December 6, 2022 15:22
Given a MSA and a list of motifs, get position in each sequence where motifs are found.
msa_motifs <- function(msa, motifs) {
require(tibble)
require(Biostrings)
require(dplyr)
# read in fasta as Biostrings object
seqs <- Biostrings::readAAStringSet(filepath = msa, format = "fasta")
maxseqlen <- max(width(seqs))
# initiate results tibble
results <- tibble::tibble(position = seq(1:maxseqlen))
# readAAMultipleAlignment requires equal length of sequences
#!/bin/bash
# FUNCTION
## this script recursively searches a directory for fasta files matching a pattern
## found files are concatenated and sorted by descending sequence length
# INPUT
## first positional parameter is the directory to search
## second is the pattern to match in fasta filenames
## third is the output filename
@acvill
acvill / gtf2tibble.R
Last active May 31, 2023 14:39
function to read in gtf annotation files as tibbles
# tested with R v4.2.0 and tidyverse v2.0.0
read_gtf <- function(file) {
require(tidyverse)
cnames <- c("seqname","source","feature","start","end","score","strand","frame","attribute")
# read in raw gtf as tsv and remove comment rows
messy <- read_tsv(file, col_names = cnames, comment = "#")
# get the unique attribute types
@acvill
acvill / get_pvals.R
Last active December 6, 2022 15:27
Takes 2 tables, performs column-wise Wilcoxon test for each shared colname, generates p value table
require(tibble)
require(dplyr)
require(stats)
# make mock data, genes as columns
control <- tibble(subject = paste0("control", sprintf('%0.2d', 1:20)),
gene1 = rnorm(20, 10, 2),
gene2 = rnorm(20, 10, 2),
gene3 = rnorm(20, 10, 2))