Alon Shaiber ShaiberAlon

## convert-gdc-files-json-to-tsv.py
import json
import pandas
import argparse

# Command-line interface: one option for the GDC JSON input and one for the
# TSV output path (which falls back to 'file-dict.tsv' when omitted).
parser = argparse.ArgumentParser(
    description='Convert GDC JSON with files details to a TSV with the columns: case_id, file_id, file_name',
)
parser.add_argument(
    '-j', '--json',
    type=str,
    metavar='JSON',
    help='JSON file with file details from GDC',
)
parser.add_argument(
    '-o', '--output',
    metavar='TSV',
    help='Path to output file',
    default='file-dict.tsv',
)

## get-core-functions.py
#!/usr/bin/env python

import argparse
import pandas as pd

# Command-line interface: input/output paths plus two optional header labels,
# both of which default to None.
parser = argparse.ArgumentParser(
    description='Find the items that occur in all rows of a table (i.e. rows in which all values are greater than 0) and save to output file.',
)
parser.add_argument(
    '-i', '--input',
    help='Input file.',
)
parser.add_argument(
    '-o', '--output',
    help='Output file.',
)
parser.add_argument(
    '--items-label',
    default=None,
    help='Header for the items column. If none is provided then there will be no header row.',
)
parser.add_argument(
    '--index-label',
    default=None,
    help='Header for the index column. If none is provided then there will be no header row.',
)

## convert-frequencey-table-to-occurrence-table.py
#!/usr/bin/env python

import argparse
import pandas as pd

# Command-line interface: an input path (-i/--input) and an output path
# (-o/--output); neither option declares a default, so both are None when omitted.
# NOTE(review): the description string below talks about finding items that
# occur in all rows, while this script is named as a frequency-table to
# occurrence-table converter -- looks copied from another script; confirm and
# reword if so.
parser = argparse.ArgumentParser(description='Find the items that occur in all rows of a table (i.e. rows in which all values are greater than 0) and save to output file.')
parser.add_argument('-i', '--input', help='Input file.')
parser.add_argument('-o', '--output', help='Output file.')
# Parse the process command line immediately, at module level.
args = parser.parse_args()

## get-gcs-of-core-functions.py
#!/usr/bin/env python

def main(args):
    import pandas as pd
    data = pd.read_csv(args.enrichment_data, sep='\t', index_col=0)
    name_dict = pd.read_csv(args.name_dict, sep='\t', index_col=0, header=None)
    core_funcs = pd.read_csv(args.core_functions, sep='\t', index_col=0)
    gcs_of_core_functions = []
    for func in core_funcs.index:
        original_func_names = list(name_dict.loc[name_dict[1]==func].index)

## summarize_blast_results.py
#!/usr/bin/env python

# Click 'Download > Multiple-file JSON' from NCBI search results page,
# unzip it, run this script in it, this way:
# python summarize_blast_results_of_anvio_HMMs.py OUTPUT_FILE.txt

import sys
import json
import glob

## gen-anvio-interactive-from-tabular-data.snake
# This script requires a config file (in JSON or YAML format) with this structure:
# {
#     "data": "path-to-tabular-data.txt",
#     "name": "project-name-to-use-as-prefix-for-output-files",
#     "metadata": "path-to-metadata-file.txt",
#     "transpose": "true-or-false-if-you-want-to-transpose-the-data",
#     "fix_item_names": "true-if-row-names-start-with-numeric",
#     "output_dirs": {
#         "LOGS_DIR": "00_LOGS",
#         "INTERACTIVE_DIR": "00_ANVIO_FIGURE",

## fix_functional_occurrence_table.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
def main(args):
    import pandas as pd
    import re

    input_file = args.input_file
    output_file = args.output_file
    data = pd.read_csv(input_file, sep='\t', index_col=False)

## gen-collection-for-merged-fasta.py
#!/usr/bin/env python
# To use this:
# python gen-collection-for-merged-fasta.py -f fasta.txt -o collection-file.txt

import sys

from anvio import fastalib
from anvio import utils
from anvio import filesnpaths
from anvio.errors import ConfigError, FilesNPathsError

## summarize_blast_results_of_anvio_HMMs.py
#!/usr/bin/env python

# Click 'Download > Multiple-file JSON' from NCBI search results page,
# unzip it, run this script in it, this way:
# python summarize_blast_results_of_anvio_HMMs.py OUTPUT_FILE.txt

import sys
import json
import glob

## SPLIT-FASTA.py
#!/usr/bin/env python
'''
split fasta file into multiple smaller fasta files

Use like this:

python SPLIT-FASTA.py fasta-name.fa output-prefix SIZE

So if your input fasta was contigs.fa, and had 190 sequences then:
	import json
	import pandas
	import argparse

	parser = argparse.ArgumentParser(description='Convert GDC JSON with files details to a TSV with the columns: case_id, file_id, file_name')
	parser.add_argument('-j', '--json', metavar='JSON', type=str,
	help='JSON file with file details from GDC')
	parser.add_argument('-o', '--output', metavar='TSV',
	default='file-dict.tsv',
	help='Path to output file')
	#!/usr/bin/env python

	import argparse
	import pandas as pd

	parser = argparse.ArgumentParser(description='Find the items that occur in all rows of a table (i.e. rows in which all values are greater than 0) and save to output file.')
	parser.add_argument('-i', '--input', help='Input file.')
	parser.add_argument('-o', '--output', help='Output file.')
	parser.add_argument('--items-label', help='Header for the items column. If none is provided then there will be no header row.', default=None)
	parser.add_argument('--index-label', help='Header for the index column. If none is provided then there will be no header row.', default=None)
	#!/usr/bin/env python

	def main(args):
	import pandas as pd
	data = pd.read_csv(args.enrichment_data, sep='\t', index_col=0)
	name_dict = pd.read_csv(args.name_dict, sep='\t', index_col=0, header=None)
	core_funcs = pd.read_csv(args.core_functions, sep='\t', index_col=0)
	gcs_of_core_functions = []
	for func in core_funcs.index:
	original_func_names = list(name_dict.loc[name_dict[1]==func].index)
	#!/usr/bin/env python

	# Click 'Download > Multiple-file JSON' from NCBI search results page,
	# unzip it, run this script in it, this way:
	# python summarize_blast_results_of_anvio_HMMs.py OUTPUT_FILE.txt

	import sys
	import json
	import glob
	# This script requires a config file (in JSON or YAML format) with this structure:
	# {
	# "data": "path-to-tabular-data.txt",
	# "name": "project-name-to-use-as-prefix-for-output-files",
	# "metadata": "path-to-metadata-file.txt",
	# "transpose": "true-or-false-if-you-want-to-transpose-the-data",
	# "fix_item_names": "true-if-row-names-start-with-numeric",
	# "output_dirs": {
	# "LOGS_DIR": "00_LOGS",
	# "INTERACTIVE_DIR": "00_ANVIO_FIGURE",
	#!/usr/bin/env python
	# -*- coding: utf-8
	def main(args):
	import pandas as pd
	import re

	input_file = args.input_file
	output_file = args.output_file
	data = pd.read_csv(input_file, sep='\t', index_col=False)
	#!/usr/bin/env python
	# To use this:
	# python gen-collection-for-merged-fasta.py -f fasta.txt -o collection-file.txt

	import sys

	from anvio import fastalib
	from anvio import utils
	from anvio import filesnpaths
	from anvio.errors import ConfigError, FilesNPathsError
	#!/usr/bin/env python
	'''
	split fasta file into multiple smaller fasta files

	Use like this:

	python SPLIT-FASTA.py fasta-name.fa output-prefix SIZE

	So if your input fasta was contigs.fa, and had 190 sequences then: