dpryan79/human.py

## human.py
#!/usr/bin/env python
# Build the six GRCh38 chromosome-name translation tables (Ensembl, GENCODE
# and UCSC, in every direction) from a tab-separated assembly report.
#
# Report columns used (0-based): 0 = sequence name (Ensembl prefixes it with
# "CHR_" for patches and alt scaffolds), 1 = sequence role, 2 = name used as
# the Ensembl chromosome name, 4 = name used by GENCODE (and by Ensembl for
# scaffolds), 9 = UCSC name.

REPORT = "GCA_000001405.22_GRCh38.p7_assembly_report.txt"
PREFIX = "GRCh38"
TABLE_NAMES = (
    "ensembl2gencode",
    "gencode2ensembl",
    "UCSC2gencode",
    "UCSC2ensembl",
    "ensembl2UCSC",
    "gencode2UCSC",
)


def build_mappings(lines):
    """Turn assembly-report lines into the six name-translation tables.

    Args:
        lines: iterable of raw report lines (an open file works).

    Returns:
        dict mapping each entry of TABLE_NAMES to a list of
        "<source>TAB<target>NEWLINE" rows, in input order.

    Raises:
        IndexError: if a data row has fewer columns than its role requires.
    """
    tables = {name: [] for name in TABLE_NAMES}
    for line in lines:
        # Comment lines carry no mapping; blank lines used to crash on cols[1].
        if line.startswith("#") or not line.strip():
            continue
        cols = line.strip().split("\t")
        ctype = cols[1]
        if ctype == "assembled-molecule":
            ensembl, gencode, ucsc = cols[2], cols[9], cols[9]
        elif ctype in ("unlocalized-scaffold", "unplaced-scaffold"):
            ensembl, gencode, ucsc = cols[4], cols[4], cols[9]
        elif ctype in ("fix-patch", "novel-patch"):
            # Patches get no UCSC name: the *2UCSC tables receive an empty
            # target and the UCSC2* tables receive no row at all.
            ensembl, gencode, ucsc = "CHR_" + cols[0], cols[4], None
        elif ctype == "alt-scaffold":
            ensembl, gencode, ucsc = "CHR_" + cols[0], cols[4], cols[9]
        else:
            print("can't handle {}".format(ctype))
            continue

        tables["ensembl2gencode"].append("{}\t{}\n".format(ensembl, gencode))
        tables["gencode2ensembl"].append("{}\t{}\n".format(gencode, ensembl))
        if ucsc is None:
            tables["ensembl2UCSC"].append("{}\t\n".format(ensembl))
            tables["gencode2UCSC"].append("{}\t\n".format(gencode))
        else:
            tables["ensembl2UCSC"].append("{}\t{}\n".format(ensembl, ucsc))
            tables["gencode2UCSC"].append("{}\t{}\n".format(gencode, ucsc))
            tables["UCSC2ensembl"].append("{}\t{}\n".format(ucsc, ensembl))
            tables["UCSC2gencode"].append("{}\t{}\n".format(ucsc, gencode))
    return tables


def write_mappings(tables, prefix=PREFIX):
    """Write each table to "<prefix>_<name>.txt", sorted case-insensitively.

    Args:
        tables: dict as returned by build_mappings().
        prefix: leading part of every output file name.
    """
    for name in TABLE_NAMES:
        # The context manager closes the file even if a write fails.
        with open("{}_{}.txt".format(prefix, name), "w") as out:
            out.writelines(sorted(tables[name], key=str.lower))


def main(report=REPORT, prefix=PREFIX):
    """Read the assembly report and write all six mapping files."""
    with open(report, "r") as ifile:
        tables = build_mappings(ifile)
    write_mappings(tables, prefix)


if __name__ == "__main__":
    main()

## mouse.py
#!/usr/bin/env python
# Build the six GRCm38 chromosome-name translation tables (Ensembl, GENCODE
# and UCSC, in every direction) from a tab-separated assembly report.
#
# Report columns used (0-based): 0 = sequence name (Ensembl prefixes it with
# "CHR_" for patches), 1 = sequence role, 2 = name used as the Ensembl
# chromosome name, 4 = name used by GENCODE (and by Ensembl for scaffolds),
# 9 = UCSC name.

REPORT = "GCA_000001635.6_GRCm38.p4_assembly_report.txt"
PREFIX = "GRCm38"
TABLE_NAMES = (
    "ensembl2gencode",
    "gencode2ensembl",
    "UCSC2gencode",
    "UCSC2ensembl",
    "ensembl2UCSC",
    "gencode2UCSC",
)


def build_mappings(lines):
    """Turn assembly-report lines into the six name-translation tables.

    Args:
        lines: iterable of raw report lines (an open file works).

    Returns:
        dict mapping each entry of TABLE_NAMES to a list of
        "<source>TAB<target>NEWLINE" rows, in input order.

    Raises:
        IndexError: if a data row has fewer columns than its role requires.
    """
    tables = {name: [] for name in TABLE_NAMES}
    for line in lines:
        # Comment lines carry no mapping; blank lines used to crash on cols[1].
        if line.startswith("#") or not line.strip():
            continue
        cols = line.strip().split("\t")
        ctype = cols[1]
        if ctype == "alt-scaffold":
            # Alt scaffolds are deliberately left out of every mouse table.
            continue
        if ctype == "assembled-molecule":
            ensembl, gencode, ucsc = cols[2], cols[9], cols[9]
        elif ctype in ("unlocalized-scaffold", "unplaced-scaffold"):
            ensembl, gencode, ucsc = cols[4], cols[4], cols[9]
        elif ctype in ("novel-patch", "fix-patch"):
            # UCSC ignores patches: the *2UCSC tables receive an empty target
            # and the UCSC2* tables receive no row at all.
            ensembl, gencode, ucsc = "CHR_" + cols[0], cols[4], None
        else:
            print("can't handle {}".format(ctype))
            continue

        tables["ensembl2gencode"].append("{}\t{}\n".format(ensembl, gencode))
        tables["gencode2ensembl"].append("{}\t{}\n".format(gencode, ensembl))
        if ucsc is None:
            tables["ensembl2UCSC"].append("{}\t\n".format(ensembl))
            tables["gencode2UCSC"].append("{}\t\n".format(gencode))
        else:
            tables["ensembl2UCSC"].append("{}\t{}\n".format(ensembl, ucsc))
            tables["gencode2UCSC"].append("{}\t{}\n".format(gencode, ucsc))
            tables["UCSC2ensembl"].append("{}\t{}\n".format(ucsc, ensembl))
            tables["UCSC2gencode"].append("{}\t{}\n".format(ucsc, gencode))
    return tables


def write_mappings(tables, prefix=PREFIX):
    """Write each table to "<prefix>_<name>.txt" in plain sorted order.

    Args:
        tables: dict as returned by build_mappings().
        prefix: leading part of every output file name.
    """
    for name in TABLE_NAMES:
        # The context manager closes the file even if a write fails.
        with open("{}_{}.txt".format(prefix, name), "w") as out:
            out.writelines(sorted(tables[name]))


def main(report=REPORT, prefix=PREFIX):
    """Read the assembly report and write all six mapping files."""
    with open(report, "r") as ifile:
        tables = build_mappings(ifile)
    write_mappings(tables, prefix)


if __name__ == "__main__":
    main()
	#!/usr/bin/env python
# GRCh38 name-translation script with its block structure restored (this copy
# had lost all nesting, so the loop and branch bodies were not valid Python).
#
# Report columns used (0-based): 0 = sequence name (Ensembl adds a "CHR_"
# prefix for patches and alt scaffolds), 1 = sequence role, 2 = name used as
# the Ensembl chromosome name, 4 = GENCODE name (also Ensembl's for
# scaffolds), 9 = UCSC name.

GRCH38_REPORT = "GCA_000001405.22_GRCh38.p7_assembly_report.txt"
GRCH38_OUTPUTS = (
    "ensembl2gencode",
    "gencode2ensembl",
    "UCSC2gencode",
    "UCSC2ensembl",
    "ensembl2UCSC",
    "gencode2UCSC",
)


def human_contig_names(cols):
    """Return the (ensembl, gencode, ucsc) names for one split report row.

    ucsc is None for fix/novel patches, which have no UCSC name. The whole
    result is None when the row's role (cols[1]) is not recognised.
    """
    role = cols[1]
    if role == "assembled-molecule":
        return cols[2], cols[9], cols[9]
    if role in ("unlocalized-scaffold", "unplaced-scaffold"):
        return cols[4], cols[4], cols[9]
    if role in ("fix-patch", "novel-patch"):
        return "CHR_" + cols[0], cols[4], None
    if role == "alt-scaffold":
        return "CHR_" + cols[0], cols[4], cols[9]
    return None


def collect_human_pairs(lines):
    """Build the six "<source>TAB<target>NEWLINE" row lists from report lines.

    Args:
        lines: iterable of raw report lines (an open file works).

    Returns:
        dict keyed by the entries of GRCH38_OUTPUTS; rows are in input order.
    """
    pairs = dict((name, []) for name in GRCH38_OUTPUTS)
    row = "{}\t{}\n".format
    for raw in lines:
        # Skip comment lines, and blank lines that would fail on cols[1].
        if raw.startswith("#") or not raw.strip():
            continue
        cols = raw.strip().split("\t")
        names = human_contig_names(cols)
        if names is None:
            print("can't handle {}".format(cols[1]))
            continue
        ensembl, gencode, ucsc = names
        pairs["ensembl2gencode"].append(row(ensembl, gencode))
        pairs["gencode2ensembl"].append(row(gencode, ensembl))
        # Patches keep an empty UCSC target and get no UCSC-keyed rows.
        pairs["ensembl2UCSC"].append(row(ensembl, ucsc or ""))
        pairs["gencode2UCSC"].append(row(gencode, ucsc or ""))
        if ucsc is not None:
            pairs["UCSC2ensembl"].append(row(ucsc, ensembl))
            pairs["UCSC2gencode"].append(row(ucsc, gencode))
    return pairs


def write_human_tables(pairs, prefix="GRCh38"):
    """Write every table to "<prefix>_<name>.txt", sorted ignoring case."""
    for name in GRCH38_OUTPUTS:
        with open("{}_{}.txt".format(prefix, name), "w") as f:
            for line in sorted(pairs[name], key=lambda x: x.lower()):
                f.write(line)


def main(report=GRCH38_REPORT, prefix="GRCh38"):
    """Read the GRCh38 assembly report and write the six mapping files."""
    with open(report, "r") as ifile:
        pairs = collect_human_pairs(ifile)
    write_human_tables(pairs, prefix)


if __name__ == "__main__":
    main()
	#!/usr/bin/env python
# GRCm38 name-translation script with its block structure restored (this copy
# had lost all nesting, so the loop and branch bodies were not valid Python).
#
# Report columns used (0-based): 0 = sequence name (Ensembl adds a "CHR_"
# prefix for patches), 1 = sequence role, 2 = name used as the Ensembl
# chromosome name, 4 = GENCODE name (also Ensembl's for scaffolds),
# 9 = UCSC name.

GRCM38_REPORT = "GCA_000001635.6_GRCm38.p4_assembly_report.txt"
GRCM38_OUTPUTS = (
    "ensembl2gencode",
    "gencode2ensembl",
    "UCSC2gencode",
    "UCSC2ensembl",
    "ensembl2UCSC",
    "gencode2UCSC",
)


def mouse_contig_names(cols):
    """Return the (ensembl, gencode, ucsc) names for one split report row.

    ucsc is None for novel/fix patches, which UCSC ignores. The whole result
    is None when the row's role (cols[1]) has no mapping rule; that includes
    "alt-scaffold", which the caller skips before asking.
    """
    role = cols[1]
    if role == "assembled-molecule":
        return cols[2], cols[9], cols[9]
    if role in ("unlocalized-scaffold", "unplaced-scaffold"):
        return cols[4], cols[4], cols[9]
    if role in ("novel-patch", "fix-patch"):
        return "CHR_" + cols[0], cols[4], None
    return None


def collect_mouse_pairs(lines):
    """Build the six "<source>TAB<target>NEWLINE" row lists from report lines.

    Args:
        lines: iterable of raw report lines (an open file works).

    Returns:
        dict keyed by the entries of GRCM38_OUTPUTS; rows are in input order.
    """
    pairs = dict((name, []) for name in GRCM38_OUTPUTS)
    row = "{}\t{}\n".format
    for raw in lines:
        # Skip comment lines, and blank lines that would fail on cols[1].
        if raw.startswith("#") or not raw.strip():
            continue
        cols = raw.strip().split("\t")
        if cols[1] == "alt-scaffold":
            # Alt scaffolds are dropped silently, without a warning.
            continue
        names = mouse_contig_names(cols)
        if names is None:
            print("can't handle {}".format(cols[1]))
            continue
        ensembl, gencode, ucsc = names
        pairs["ensembl2gencode"].append(row(ensembl, gencode))
        pairs["gencode2ensembl"].append(row(gencode, ensembl))
        # Patches keep an empty UCSC target and get no UCSC-keyed rows.
        pairs["ensembl2UCSC"].append(row(ensembl, ucsc or ""))
        pairs["gencode2UCSC"].append(row(gencode, ucsc or ""))
        if ucsc is not None:
            pairs["UCSC2ensembl"].append(row(ucsc, ensembl))
            pairs["UCSC2gencode"].append(row(ucsc, gencode))
    return pairs


def write_mouse_tables(pairs, prefix="GRCm38"):
    """Write every table to "<prefix>_<name>.txt" in plain sorted order."""
    for name in GRCM38_OUTPUTS:
        with open("{}_{}.txt".format(prefix, name), "w") as f:
            for line in sorted(pairs[name]):
                f.write(line)


def main(report=GRCM38_REPORT, prefix="GRCm38"):
    """Read the GRCm38 assembly report and write the six mapping files."""
    with open(report, "r") as ifile:
        pairs = collect_mouse_pairs(ifile)
    write_mouse_tables(pairs, prefix)


if __name__ == "__main__":
    main()