Skip to content

Instantly share code, notes, and snippets.

View ShaiberAlon's full-sized avatar

Alon Shaiber ShaiberAlon

  • University of Chicago
  • Chicago
View GitHub Profile
@ShaiberAlon
ShaiberAlon / convert-gdc-files-json-to-tsv.py
Created June 25, 2021 20:25
Convert GDC JSON with files details to a TSV with the columns: case_id, file_id, file_name
import json
import pandas
import argparse
parser = argparse.ArgumentParser(description='Convert GDC JSON with files details to a TSV with the columns: case_id, file_id, file_name')
parser.add_argument('-j', '--json', metavar='JSON', type=str,
help='JSON file with file details from GDC')
parser.add_argument('-o', '--output', metavar='TSV',
default='file-dict.tsv',
help='Path to output file')
#!/usr/bin/env python
import argparse
import pandas as pd
parser = argparse.ArgumentParser(description='Find the items that occur in all rows of a table (i.e. rows in which all values are greater than 0) and save to output file.')
parser.add_argument('-i', '--input', help='Input file.')
parser.add_argument('-o', '--output', help='Output file.')
parser.add_argument('--items-label', help='Header for the items column. If none is provided then there will be no header row.', default=None)
parser.add_argument('--index-label', help='Header for the index column. If none is provided then there will be no header row.', default=None)
#!/usr/bin/env python
import argparse
import pandas as pd
parser = argparse.ArgumentParser(description='Find the items that occur in all rows of a table (i.e. rows in which all values are greater than 0) and save to output file.')
parser.add_argument('-i', '--input', help='Input file.')
parser.add_argument('-o', '--output', help='Output file.')
args = parser.parse_args()
#!/usr/bin/env python
def main(args):
import pandas as pd
data = pd.read_csv(args.enrichment_data, sep='\t', index_col=0)
name_dict = pd.read_csv(args.name_dict, sep='\t', index_col=0, header=None)
core_funcs = pd.read_csv(args.core_functions, sep='\t', index_col=0)
gcs_of_core_functions = []
for func in core_funcs.index:
original_func_names = list(name_dict.loc[name_dict[1]==func].index)
@ShaiberAlon
ShaiberAlon / summarize_blast_results.py
Last active March 5, 2020 02:10
Summarize blast results
#!/usr/bin/env python
# Click 'Download > Multiple-file JSON' from NCBI search results page,
# unzip it, run this script in it, this way:
# python summarize_blast_results_of_anvio_HMMs.py OUTPUT_FILE.txt
import sys
import json
import glob
@ShaiberAlon
ShaiberAlon / gen-anvio-interactive-from-tabular-data.snake
Last active May 17, 2019 04:17
snakemake script to generate interactive view of tabular data in anvi'o
# This script requires a config file (in JSON or YAML format) with this structure:
# {
# "data": "path-to-tabular-data.txt",
# "name": "project-name-to-use-as-prefix-for-output-files",
# "metadata": "path-to-metadata-file.txt",
# "transpose": "true-or-false-if-you-want-to-transpose-the-data",
# "fix_item_names": "true-if-row-names-start-with-numeric",
# "output_dirs": {
# "LOGS_DIR": "00_LOGS",
# "INTERACTIVE_DIR": "00_ANVIO_FIGURE",
#!/usr/bin/env python
# -*- coding: utf-8
def main(args):
import pandas as pd
import re
input_file = args.input_file
output_file = args.output_file
data = pd.read_csv(input_file, sep='\t', index_col=False)
#!/usr/bin/env python
# To use this:
# python gen-collection-for-merged-fasta.py -f fasta.txt -o collection-file.txt
import sys
from anvio import fastalib
from anvio import utils
from anvio import filesnpaths
from anvio.errors import ConfigError, FilesNPathsError
#!/usr/bin/env python
# Click 'Download > Multiple-file JSON' from NCBI search results page,
# unzip it, run this script in it, this way:
# python summarize_blast_results_of_anvio_HMMs.py OUTPUT_FILE.txt
import sys
import json
import glob
@ShaiberAlon
ShaiberAlon / SPLIT-FASTA.py
Created October 8, 2018 17:58
Split fasta into multiple fasta files with a max size
#!/usr/bin/env python
'''
split fasta file into multiple smaller fasta files
Use like this:
python SPLIT-FASTA.py fasta-name.fa output-prefix SIZE
So if your input fasta was contigs.fa, and had 190 sequences then: