Albert Vill acvill

## CRISPRviewR_fix.Rmd
---
title: "CRISPRviewR updated functions"
author: "Albert Vill"
date: "24 Nov 2024"
output:
  pdf_document: default
  html_notebook: default
---

# Version Information

## tri2sym.R
tri2sym <-
  function(m, triangle) {
    if (!is.numeric(m)) {
      message("matrix must be numeric")
      stop()
    }
    if (dim(m)[1] != dim(m)[2]) {
      message("matrix must be square")
      stop()
    }

## entrez2dss.R
entrez2dss <-
  function(id_list) {

    require(rentrez)    # v1.2.3
    require(Biostrings) # v2.66.0
    require(stringr)    # v1.5.0

    # fetch all sequences, trim empty elements
    raw_ <-
      entrez_fetch(db = "nucleotide",

## make_unambiguous.R
make_unambiguous <- function(dna) {
  require(tidyverse)
  require(S4Vectors)
  iupac <- tibble(code = c("A", "C", "G", "T",
                           "R", "Y", "S", "W",
                           "K", "M", "B", "D",
                           "H", "V", "N"),
                  base = c("A", "C", "G", "T",
                           "AG", "CT", "GC", "AT",
                           "GT", "AC", "CGT", "AGT",

## plot_vcf.R
# Given a VCF and a GFF, plot the position and allele frequency of SNPs and
# indels over the given ranges
# tested with VCFv4.1 and GFF version 3
# expects certain VCF and GFF attributes, so it may not work with all files

plot_vcf <- function(vcf, gff, ranges) {

  require(tidyverse) # v2.0.0
  require(gggenes) # v0.5.0
  require(patchwork) # v1.1.2


## parse_STREME.R
# assumes a fixed number of lines in the file's header and footer
# (the code below drops the first 29 lines and the last 10);
# these counts might change with different versions of the MEME suite

read_PWMs <- function(file) {
  pset <- head(readLines(con = file, n = -10)[-c(1:29)], -10)
  pset <- subset(pset, !grepl(pattern = "letter", pset))
  i1 <- !nzchar(pset)
  pset <- unname(split(pset[!i1], cumsum(i1)[!i1]))
  names(pset) <- gsub(pattern = " ",
                      replacement = "_",

## forna_colors.R
# requires the native pipe (|>), available in R >= 4.1
## maps each base of a sequence to a particular color
## colors can be given as names or hex values
## outputs a format recognized by the forna web app's custom color editor
## http://rna.tbi.univie.ac.at/forna/forna.html

forna_colors <- function(sequence, colormap) {
  inseq <- gsub(x = strsplit(toupper(as.character(sequence)),split="")[[1]],
                pattern="T",
                replacement="U")

## msa_motifs.R
msa_motifs <- function(msa, motifs) {
  require(tibble)
  require(Biostrings)
  require(dplyr)
  # read in fasta as Biostrings object
  seqs <- Biostrings::readAAStringSet(filepath = msa, format = "fasta")
  maxseqlen <- max(width(seqs))
  # initiate results tibble
  results <- tibble::tibble(position = seq(1:maxseqlen))
  # readAAMultipleAlignment requires equal length of sequences

## find_combine_sort.sh
#!/bin/bash

# FUNCTION
## this script recursively searches a directory for fasta files matching a pattern
## found files are concatenated and sorted by descending sequence length

# INPUT
## first positional parameter is the directory to search
## second is the pattern to match in fasta filenames
## third is the output filename

## gtf2tibble.R
# tested with R v4.2.0 and tidyverse v2.0.0
read_gtf <- function(file) {

  require(tidyverse)
  cnames <- c("seqname","source","feature","start","end","score","strand","frame","attribute")

  # read in raw gtf as tsv and remove comment rows
  messy <- read_tsv(file, col_names = cnames, comment = "#")

  # get the unique attribute types
	---
	title: "CRISPRviewR updated functions"
	author: "Albert Vill"
	date: "24 Nov 2024"
	output:
	  pdf_document: default
	  html_notebook: default
	---

	# Version Information
	tri2sym <-
	function(m, triangle) {
	if (!is.numeric(m)) {
	message("matrix must be numeric")
	stop()
	}
	if (dim(m)[1] != dim(m)[2]) {
	message("matrix must be square")
	stop()
	}
	entrez2dss <-
	function(id_list) {

	require(rentrez) # v1.2.3
	require(Biostrings) # v2.66.0
	require(stringr) # v1.5.0

	# fetch all sequences, trim empty elements
	raw_ <-
	entrez_fetch(db = "nucleotide",
	make_unambiguous <- function(dna) {
	require(tidyverse)
	require(S4Vectors)
	iupac <- tibble(code = c("A", "C", "G", "T",
	"R", "Y", "S", "W",
	"K", "M", "B", "D",
	"H", "V", "N"),
	base = c("A", "C", "G", "T",
	"AG", "CT", "GC", "AT",
	"GT", "AC", "CGT", "AGT",
	# Given vcf and gff, plot position and allele freq. of SNPs and indels over ranges
	# tested with VCFv4.1 and gff version 3
	# expects certain vcf and gff attributes, may not work with all files

	plot_vcf <- function(vcf, gff, ranges) {

	require(tidyverse) # v2.0.0
	require(gggenes) # v0.5.0
	require(patchwork) # v1.1.2
	# assumes fixed number of lines in header and footer
	# might change with different versions of MEME suite

	read_PWMs <- function(file) {
	pset <- head(readLines(con = file, n = -10)[-c(1:29)], -10)
	pset <- subset(pset, !grepl(pattern = "letter", pset))
	i1 <- !nzchar(pset)
	pset <- unname(split(pset[!i1], cumsum(i1)[!i1]))
	names(pset) <- gsub(pattern = " ",
	replacement = "_",
	# requires the native pipe from R >= 4.1
	## maps each base to a particular color
	## colors can be named or hex values
	## outputs a format recognized by the forna webapp's custom color editor
	## http://rna.tbi.univie.ac.at/forna/forna.html

	forna_colors <- function(sequence, colormap) {
	inseq <- gsub(x = strsplit(toupper(as.character(sequence)),split="")[[1]],
	pattern="T",
	replacement="U")
	msa_motifs <- function(msa, motifs) {
	require(tibble)
	require(Biostrings)
	require(dplyr)
	# read in fasta as Biostrings object
	seqs <- Biostrings::readAAStringSet(filepath = msa, format = "fasta")
	maxseqlen <- max(width(seqs))
	# initiate results tibble
	results <- tibble::tibble(position = seq(1:maxseqlen))
	# readAAMultipleAlignment requires equal length of sequences
	#!/bin/bash

	# FUNCTION
	## this script recursively searches a directory for fasta files matching a pattern
	## found files are concatenated and sorted by descending sequence length

	# INPUT
	## first positional parameter is the directory to search
	## second is the pattern to match in fasta filenames
	## third is the output filename
	# tested with R v4.2.0 and tidyverse v2.0.0
	read_gtf <- function(file) {

	require(tidyverse)
	cnames <- c("seqname","source","feature","start","end","score","strand","frame","attribute")

	# read in raw gtf as tsv and remove comment rows
	messy <- read_tsv(file, col_names = cnames, comment = "#")

	# get the unique attribute types