Vinh Tran trvinh

## find_isoforms.py
#!/usr/bin/env python3
import argparse
import pandas as pd


def find_clusters(df: pd.DataFrame, protein_list: list) -> pd.DataFrame:
    """
    Given a dataframe and a list of protein IDs,
    return groups of proteins that belong to the same cluster (per chromosome).
    """

## gff_parser.py
import pandas as pd
import re
import argparse

def parse_gff(gff_file):
    """
    Parse GFF file and extract CDS with protein ID, gene locus, mRNA, chrom, strand, positions.
    """
    records = []

## create_core_hmm.py
#!/bin/env python

import os
import sys
import argparse
from pathlib import Path
import subprocess
import multiprocessing as mp
from tqdm import tqdm

## install_phyloprofile.txt
# create new conda env
mamba create -n phyloprofile_v1.20 r-base pkg-config pkgconfig fontconfig gsl lxml
# activate that env and start an R terminal
mamba activate phyloprofile_v1.20
R
# install phyloprofile from bioconductor
install.packages("BiocManager")
BiocManager::install("PhyloProfile")
# or install dev version from github
install.packages("devtools")

## split_multi_domains_file.R
library(data.table)
library(dplyr)

#' Split a multi ortholog group file into single files
splitDomainFile <- function(domainFile = NULL, outPath = NULL) {
    if (is.null(domainFile)) stop("Domain file cannot be NULL")
    if (is.null(outPath)) stop("Output path cannot be NULL")

    df <- fread(
        domainFile, header = TRUE, stringsAsFactors = FALSE, sep = "\t"

## update_ete_ncbi.py
from ete3 import NCBITaxa
# Create ete3's NCBI taxonomy handle, then ask it to refresh its local
# taxonomy database through update_taxonomy_database().
# NOTE(review): per the ete3 documentation this downloads the current NCBI
# taxdump and rebuilds the local database, so it presumably needs network
# access and write permission to ete3's data directory — confirm for the
# installed ete3 version.
ncbi = NCBITaxa()
ncbi.update_taxonomy_database()

## process_taxDB.R
library(PhyloProfile)

processNcbiTaxonomy <- function(taxdmpfile = NULL) {
    if (is.null(taxdmpfile) || !file.exists(taxdmpfile)) {
        stop("taxdmp.zip file invalid!")
    } else temp <- taxdmpfile

    names <- utils::read.table(
        unz(temp, "names.dmp"), header = FALSE, fill = TRUE, sep = "\t",
        quote = "", comment.char = "", stringsAsFactors = FALSE

## combine_fasta.py
# -*- coding: utf-8 -*-

from Bio import SeqIO
import argparse
import shutil

def combine_fa(fa_1, fa_2, out_file):
    """ Combine 2 fasta files """
    new_fa_dict = SeqIO.to_dict(SeqIO.parse(open(fa_2),'fasta'))
    existing_seq = SeqIO.to_dict(SeqIO.parse(open(fa_1),'fasta'))

## update_data_pp.txt
library(PhyloProfile)

setwd('PhyloProfile/data')

# load data
data(taxonNamesReduced)
# modify the dataframe
# for example, rename Actinobacteria to Actinomycetota
taxonNamesReduced$fullName[
    taxonNamesReduced$rank == "phylum" & taxonNamesReduced$ncbiID == 201174

## use_timeit.py
import timeit

def test(st,en):
    return random.randint(st, en)

t = timeit.Timer(lambda: test(10, 100))
print(t.timeit(10))
	#!/usr/bin/env python3
	import argparse
	import pandas as pd


	def find_clusters(df: pd.DataFrame, protein_list: list) -> pd.DataFrame:
	"""
	Given a dataframe and a list of protein IDs,
	return groups of proteins that belong to the same cluster (per chromosome).
	"""
	import pandas as pd
	import re
	import argparse

	def parse_gff(gff_file):
	"""
	Parse GFF file and extract CDS with protein ID, gene locus, mRNA, chrom, strand, positions.
	"""
	records = []
	#!/bin/env python

	import os
	import sys
	import argparse
	from pathlib import Path
	import subprocess
	import multiprocessing as mp
	from tqdm import tqdm
	# create new conda env
	mamba create -n phyloprofile_v1.20 r-base pkg-config pkgconfig fontconfig gsl lxml
	# activate that env and start an R terminal
	mamba activate phyloprofile_v1.20
	R
	# install phyloprofile from bioconductor
	install.packages("BiocManager")
	BiocManager::install("PhyloProfile")
	# or install dev version from github
	install.packages("devtools")
	library(data.table)
	library(dplyr)

	#' Split a multi ortholog group file into single files
	splitDomainFile <- function(domainFile = NULL, outPath = NULL) {
	if (is.null(domainFile)) stop("Domain file cannot be NULL")
	if (is.null(outPath)) stop("Output path cannot be NULL")

	df <- fread(
	domainFile, header = TRUE, stringsAsFactors = FALSE, sep = "\t"
	from ete3 import NCBITaxa
	# Create ete3's NCBI taxonomy handle, then ask it to refresh its local
	# taxonomy database through update_taxonomy_database().
	# NOTE(review): per the ete3 documentation this downloads the current NCBI
	# taxdump and rebuilds the local database, so it presumably needs network
	# access — confirm for the installed ete3 version.
	ncbi = NCBITaxa()
	ncbi.update_taxonomy_database()
	library(PhyloProfile)

	processNcbiTaxonomy <- function(taxdmpfile = NULL) {
	if (is.null(taxdmpfile) || !file.exists(taxdmpfile)) {
	stop("taxdmp.zip file invalid!")
	} else temp <- taxdmpfile

	names <- utils::read.table(
	unz(temp, "names.dmp"), header = FALSE, fill = TRUE, sep = "\t",
	quote = "", comment.char = "", stringsAsFactors = FALSE
	# -*- coding: utf-8 -*-

	from Bio import SeqIO
	import argparse
	import shutil

	def combine_fa(fa_1, fa_2, out_file):
	""" Combine 2 fasta files """
	new_fa_dict = SeqIO.to_dict(SeqIO.parse(open(fa_2),'fasta'))
	existing_seq = SeqIO.to_dict(SeqIO.parse(open(fa_1),'fasta'))
	library(PhyloProfile)

	setwd('PhyloProfile/data')

	# load data
	data(taxonNamesReduced)
	# modify the dataframe
	# for example, rename Actinobacteria to Actinomycetota
	taxonNamesReduced$fullName[
	taxonNamesReduced$rank == "phylum" & taxonNamesReduced$ncbiID == 201174
	import timeit

	def test(st,en):
	return random.randint(st, en)

	t = timeit.Timer(lambda: test(10, 100))
	print(t.timeit(10))