Last active
March 25, 2019 13:05
-
-
Save yangwu91/79b74035465978e78d41af4236597ff1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding:utf-8 -*- | |
# Written by Wu, Yang <wuyang@vt.edu> | |
''' | |
Usage: | |
python stringtie_FPKM.py <stringtie_output_dir>|<sample_list> | |
For example: | |
python stringtie_FPKM.py /foo/stringtie_output/ | |
Or (it is better if the groups are ordered, i.e. Group1, Group2, Group3...): | |
python stringtie_FPKM.py sample_list.tsv | |
sample_list.tsv content (tabluar format): | |
/foo/Sample1/\tGroup1 | |
/foo/Sample2/\tGroup2 | |
/foo/Sample3/\tGroup3 | |
/foo/Sample4/\tGroup4 | |
''' | |
import sys, subprocess, os, re | |
from copy import deepcopy | |
try: | |
arg = sys.argv[1] | |
except IndexError: | |
print __doc__ | |
sys.exit(1) | |
if os.path.isfile(arg): | |
with open(arg, 'r') as inf: | |
filelist = [tuple(line.strip().split('\t')) for line in inf] | |
elif os.path.isdir(arg): | |
filelist = [(f, os.path.split(f)[0]. split('/')[-1]) \ | |
for f in subprocess.check_output('find %s -name "*.gtf" -type f | sort -u' % arg, | |
shell=True).strip().split('\n')] | |
print filelist | |
FPKM_dict = {} | |
groups = [] | |
i = 0 | |
max_len = 0 | |
total = len(filelist) | |
gene_id_p = r'gene_id "(.+?)";' | |
trx_id_p = r'transcript_id "(.+?)";' | |
ref_gene_name_p = r'ref_gene_name "(.+?)";' | |
FPKM_p = r'FPKM "(.+?)";' | |
#TPM_p = r'TPM "(.+?)";' | |
for item in filelist: | |
i += 1 | |
filename = item[0] | |
if os.path.isdir(filename): | |
if filename[-1] != '/': | |
filename += '/' | |
filename += subprocess.check_output('ls %s|grep "\.gtf$"' % filename, shell=True).strip() | |
group = item[-1] | |
info = "(%s/%s) %s" % (i, total, group) | |
if len(info) < max_len: | |
blank = max_len - len(info) | |
else: | |
max_len = len(info) | |
blank = 0 | |
print(info + ' '*blank + '\r'), | |
#sys.stdout.write('(%s/%s) %s\r' % (i, total, group)) | |
sys.stdout.flush() | |
groups.append(group) | |
with open(filename, 'r') as inf: | |
for line in inf: | |
line = line.strip().split('\t') | |
try: | |
if line[2] == "transcript": | |
info = line[-1] | |
# May vary: | |
#gene_id = re.findall(ref_gene_name_p, info) | |
gene_id = re.findall(gene_id_p, info) | |
trx_id = re.findall(trx_id_p, info) | |
FPKM = re.findall(FPKM_p, info) | |
if len(gene_id) == 1 and len(FPKM) == 1: | |
ID = "%s\t%s" % (gene_id[0], trx_id[0]) | |
FPKM = FPKM[0] | |
FPKM_stats = deepcopy(FPKM_dict.get(ID, {})) | |
FPKM_stats[group] = FPKM | |
FPKM_dict[gene_id] = deepcopy(FPKM_stats) | |
except IndexError: | |
pass | |
with open("FPKM_stats.tsv", 'w') as outf: | |
print "\nSummarizing..." | |
n = 0 | |
for item in sorted(FPKM_dict.iteritems()): | |
n += 1 | |
if n == 1: | |
outf.write("Gene_id\tTranscript_id\t" + '\t'.join(groups) + "\n") | |
FPKMs = [item[-1].get(g, "NA") for g in groups] | |
outf.write(item[0] + '\t' + "\t".join(FPKMs) + '\n') | |
print "All set." |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment