Ryan Dale daler

## make-gtf-db.py
"""
https://www.biostars.org/p/152517/

Example of how to work with Ensembl release 81 GTF files, which:

    1) already have genes and transcripts included

    2) have unique IDs for genes, transcripts, and exons in the corresponding
       "<featuretype>_id" attribute

## prepare-rRNA.sh
#!/usr/bin/env bash

# Ryan Dale, July 2015
# dalerr@niddk.nih.gov
#
# CollectRnaSeqMetrics.jar from Picard [1] needs an interval list corresponding
# to ribosomal RNA. The format is described at [2].
#
# SAM header creation idea from [3]; idea for using rmsk tables to get rRNA is
# from [4].

## LICENSE
The MIT License (MIT)

Copyright (c) 2016 Ryan Dale

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

## gffparse_example.py
from gffutils.iterators import DataIterator

input_filename = 'example.gff'
output_filename = 'output.gff'

with open(output_filename, 'w') as fout:
    for feature in DataIterator(input_filename):
        # len() works to get the length of a feature in bp
        if len(feature) < 1000:
            continue

## deploy.sh
#!/bin/bash
set -e
set -o pipefail

# All-in-one installation script to download, configure, and run cloudbiolinux
# to install bioinformatics tools locally without needing sudo. The executables
# will go into $INSTALL_DIR:
INSTALL_DIR=~/tmp/cbl_demo

# See https://github.com/chapmanb/cloudbiolinux for more info on customizing

## dashes-fasta.py
import pybedtools

# This demo uses files that ship with pybedtools
a = pybedtools.example_bedtool('a.bed')
fasta = pybedtools.example_filename('test.fa')

# Use a properly-formatted BED file, and then post-process the resulting fasta.
x = a.sequence(fi=fasta, s=True)
for i, line in enumerate(open(x.seqfn)):
    if line.startswith('>') and i >0:

## genome-minimal-fail.gff
##gff-version 3
scaffold_28	prediction	gene	1	402	0	+	.	ID=545184;Name=545184
scaffold_28	prediction	gene	805	981	0	-	.	ID=93782;Name=93782
scaffold_28	prediction	gene	2030	2721	0	+	.	ID=545205;Name=545205
scaffold_28	prediction	gene	3273	3545	0	-	.	Name=YOL159C-A;Synteny=no_synteny;SystematicGeneName=YOL159C-A;ID=38792
scaffold_28	prediction	gene	5318	5833	0	-	.	Name=YOL159C;Synteny=no_synteny;SystematicGeneName=YOL159C;ID=38793
scaffold_28	prediction	gene	6780	8600	0	-	.	Name=ENB1;Synteny=no_synteny;SystematicGeneName=YOL158C;StandardGeneName=ENB1;ID=38794
scaffold_28	prediction	gene	9698	11467	0	-	.	Name=IMA4;Synteny=no_synteny;SystematicGeneName=YJL221C;StandardGeneName=IMA4;ID=38795

## README.rst

      
              5 files
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                daler
                / README.rst
            
            
              Last active
              November 6, 2019 23:44
            
              
                example data for metaseq_demo
              
          
    This gist provides example data for the metaseq_demo.py script


## pybedtools-issue-110.py
import pybedtools
import pandas


def split_coverage(x):
    """
    Split a coverage file created using bedtools coverage -hist -- which will
    have trailing "all" hist lines -- into 1) a BedTool object with valid BED
    lines and 2) a pandas DataFrame of all coverage, parsed from the trailing
    "all" lines.

## ftpes-downloader.py
#!/usr/bin/env python

import os
import sys
import platform
import argparse
import ftplib
import time
import netrc
from fnmatch import fnmatch
	"""
	https://www.biostars.org/p/152517/

	Example of how to work with Ensembl release 81 GTF files, which:

	1) already have genes and transcripts included

	2) have unique IDs for genes, transcripts, and exons in the corresponding
	"<featuretype>_id" attribute
	#!/usr/bin/env bash

	# Ryan Dale, July 2015
	# dalerr@niddk.nih.gov
	#
	# CollectRnaSeqMetrics.jar from Picard [1] needs an interval list corresponding
	# to ribosomal RNA. The format is described at [2].
	#
	# SAM header creation idea from [3]; idea for using rmsk tables to get rRNA is
	# from [4].
	The MIT License (MIT)

	Copyright (c) 2016 Ryan Dale

	Permission is hereby granted, free of charge, to any person obtaining a copy
	of this software and associated documentation files (the "Software"), to deal
	in the Software without restriction, including without limitation the rights
	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	copies of the Software, and to permit persons to whom the Software is
	furnished to do so, subject to the following conditions:
	from gffutils.iterators import DataIterator

	input_filename = 'example.gff'
	output_filename = 'output.gff'

	with open(output_filename, 'w') as fout:
	for feature in DataIterator(input_filename):
	# len() works to get the length of a feature in bp
	if len(feature) < 1000:
	continue
	#!/bin/bash
	set -e
	set -o pipefail

	# All-in-one installation script to download, configure, and run cloudbiolinux
	# to install bioinformatics tools locally without needing sudo. The executables
	# will go into $INSTALL_DIR:
	INSTALL_DIR=~/tmp/cbl_demo

	# See https://github.com/chapmanb/cloudbiolinux for more info on customizing
	import pybedtools

	# This demo uses files that ship with pybedtools
	a = pybedtools.example_bedtool('a.bed')
	fasta = pybedtools.example_filename('test.fa')

	# Use a properly-formatted BED file, and then post-process the resulting fasta.
	x = a.sequence(fi=fasta, s=True)
	for i, line in enumerate(open(x.seqfn)):
	if line.startswith('>') and i >0:
	##gff-version 3
	scaffold_28 prediction gene 1 402 0 + . ID=545184;Name=545184
	scaffold_28 prediction gene 805 981 0 - . ID=93782;Name=93782
	scaffold_28 prediction gene 2030 2721 0 + . ID=545205;Name=545205
	scaffold_28 prediction gene 3273 3545 0 - . Name=YOL159C-A;Synteny=no_synteny;SystematicGeneName=YOL159C-A;ID=38792
	scaffold_28 prediction gene 5318 5833 0 - . Name=YOL159C;Synteny=no_synteny;SystematicGeneName=YOL159C;ID=38793
	scaffold_28 prediction gene 6780 8600 0 - . Name=ENB1;Synteny=no_synteny;SystematicGeneName=YOL158C;StandardGeneName=ENB1;ID=38794
	scaffold_28 prediction gene 9698 11467 0 - . Name=IMA4;Synteny=no_synteny;SystematicGeneName=YJL221C;StandardGeneName=IMA4;ID=38795
	import pybedtools
	import pandas


	def split_coverage(x):
	"""
	Split a coverage file created using bedtools coverage -hist -- which will
	have trailing "all" hist lines -- into 1) a BedTool object with valid BED
	lines and 2) a pandas DataFrame of all coverage, parsed from the trailing
	"all" lines.
	#!/usr/bin/env python

	import os
	import sys
	import platform
	import argparse
	import ftplib
	import time
	import netrc
	from fnmatch import fnmatch