Skip to content

Instantly share code, notes, and snippets.

View audy's full-sized avatar
🔬

Austin Richardson audy

🔬
View GitHub Profile
@audy
audy / Snakefile
Last active October 20, 2023 20:44
# Rule to count the number of lines in input.interleaved.fastq.
# The file is fed to `wc -l` on stdin, so line_count.txt receives only the
# number and not the file name.
rule count_lines:
    input:
        "input.interleaved.fastq"
    output:
        "line_count.txt"
    shell:
        "wc -l < {input} > {output}"
# Rule to convert paired-end FASTQ files to interleaved format
@audy
audy / ascii-to-dna.py
Created April 11, 2023 06:43
Encode ASCII text as DNA sequences
#!/usr/bin/env python3
from itertools import product
from typing import Dict, Tuple
import sys
def generate_codon_mapping(
dna_alphabet="GATC",
@audy
audy / download-assemblies-from-ncbi.py
Created April 3, 2023 02:30
quickly fetch assemblies from NCBI
#!/usr/bin/env python3
import os
import downloads
from joblib import Parallel, delayed
from itertools import islice
from tqdm import tqdm
def get_gbk_path(assembly) -> str:
"""
import taxonomy
# Taxonomy object built by Taxonomy.from_ncbi from the "ncbi_taxdump/" path.
tax = taxonomy.Taxonomy.from_ncbi("ncbi_taxdump/")

# Rank names listed from the broadest (superkingdom) to the narrowest (species).
FULL_RANKS = [
    "superkingdom",
    "phylum",
    "class",
    "order",
    "family",
    "genus",
    "species",
]
def get_scalar(d: dict, key: str):
if key in d:
vals = d[key]
if len(vals) == 1:
>MN908947.3
ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCT
GTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTGCACT
CACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATC
TTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTT
CGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAAC
ACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGG
AGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGG
CTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAA
ACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACT
[
(None, {"id": "447834", "parent": "447833", "rank": "subspecies"}),
(None, {"id": "447833", "parent": "111897", "rank": "species"}),
(None, {"id": "111897", "parent": "1664845", "rank": "genus"}),
(None, {"id": "1664845", "parent": "42282", "rank": "tribe"}),
(None, {"id": "42282", "parent": "33415", "rank": "subfamily"}),
(None, {"id": "33415", "parent": "37572", "rank": "family"}),
(None, {"id": "37572", "parent": "104431", "rank": "superfamily"}),
(None, {"id": "104431", "parent": "37567", "rank": "clade"}),
(None, {"id": "37567", "parent": "41197", "rank": "clade"}),
@audy
audy / get-barcodes.py
Created July 2, 2020 18:16
Check the frequency of the first N bases to try to identify barcode sequences in a non-demultiplexed FASTQ file
#!/usr/bin/env python3
# usage: cat reads.fastq | ./get-barcodes.py
from Bio import SeqIO
from collections import defaultdict
# Barcode length in bases; presumably the size of the read prefix that gets
# tallied -- TODO confirm against the read loop.
BARCODE_SIZE = 14

# Tally whose missing keys start at 0. `int` is the idiomatic zero factory
# and, unlike a lambda, keeps the mapping picklable.
counts = defaultdict(int)
model STRING NULLABLE
taxonomy_name STRING NULLABLE
description STRING NULLABLE
title STRING NULLABLE
gsm STRING NULLABLE
attributes STRING REPEATED
dbgap STRING NULLABLE
attribute_recs RECORD REPEATED
attribute_recs. unit STRING NULLABLE
attribute_recs. display_name STRING NULLABLE
>AY179509.1 Mink astrovirus, complete genome
CCGAAGTAGGTGTGTGTGTTGCCGTTATGGCTAACAACACTACCAGCGCTCTTCACCCTCGTGGCTCTGGCCAGCGCTGT
GTCTATGACACAGTGCTCCGGTTTGGGGACCCCGATGCACGTCGCAGGGGTTTCCAATTGGACGAGGTGTCACATAATAA
GTTGTGTGACATTTTTGACAGCGGCCCGCTCCACTTCGCTTTTGGTGATCTTAAAGTGATGAAGGTGGCGGGTGGTGTGG
TCACACCGCATAAAACAGTTGTCAAAACAGTCTATGTCTCAGGTGTTCAAGAGGGTAACGATTATGTCACTTTTGCCTTC
ACGCCTGGACCTAACGAGTGGCGCGAAGTTGATCCCCGCATCGACAAGCGCACAGCACTCGTCGGTGTCCTTGTGCAAGA
ACATAAAAAATTGGACTCAGACCTTAAGGAGTCGCGCCGTGAGTTGTCCCAGCTCAAGTTGGAGCACTCACTGTTGAGAC
ATGACTATGAGCGCTTGGTCCGTGAAAAGCCTGGTCCTGCTATGAGAACTTTTAAATTCTCAGCTGTCATCTTTTATGCG
TTTTTCCTTGGTTTCCTGCTTATGTCTGCTGTCAAGGGTGAGGTGTATGGTCGCTGTCTTGACAGCGAGCTTAACCTCAA
TGGCAACCCTGAAGTGTGTTTGCATTGGGAAGAGGTTAAATCTTTTAGCCTCCAGGTTGCCCTTGCAGACTTCTGGAACA
class BaseKitty
def meow
nil
end
end
class Nancy < BaseKitty
def meow
(super || "shshsh")
end