disulfidebond/parse_mhc_workflow_pt2.md

## parse_mhc_workflow_pt2.md

      
    Raw
  

              parse_mhc_workflow_pt2.md
            
          
    Description of Workflow

Step 2: Use python file to parse the filtered output from part 1 (most likely the file named 'parsed_mhc_output.txt'). The python script uses argparse(), so it can be called from the commandline with one of the following options, but note that output is to STDOUT, so it will need to be redirected to a file.


exonList -> a text file with a comma separated list of exons


fastaExonList -> a fasta formatted list of exons


mergedFastaList -> a fasta formatted list of cDNA sequences
          #!/usr/bin/python3
          import argparse
          import time

          def parsedAndCastRange(xList):
              # requires: string in the format '1..5', returns [1,5] as list of ints
              unparsedRangeAsList = xList.split('.')
              parsedRange = list(filter(lambda x: x!= '', unparsedRangeAsList))
              return [int(parsedRange[0]), int(parsedRange[1])]

          def parsedExonOutput(s):
              retList = []
              for i in s:
                  listOfValues = i
                  exonSeq = i[-1]
                  exon_ct = []
                  exon_ct_list = []
                  mxLen = len(listOfValues[3])
                  for j in listOfValues[3]:
                      exon_ct.append(j[0])
                      exon_ct_list.append((j[1][0], j[1][1]))
                  # indexTransition = 0
                  indexTransition = 0
                  checkForIntron = 0
                  for x in range(0, mxLen, 1):
                      exonStartAndStop = exon_ct_list[x]
                      exonStart = exonStartAndStop[0]
                      if x == 0:
                          exonStart = exonStartAndStop[0] - 1
                      exonStop = exonStartAndStop[1] + 1
                      exonSequence = exonSeq[exonStart:exonStop]
                      indexTransition = exonStartAndStop[1]+1
                      r = str(listOfValues[0]) + ':' + str(exon_ct[x]) + ',' + exonSequence
                      retList.append(r)
              return retList


          if __name__ == "__main__":
              parser = argparse.ArgumentParser(description='parser for MHC file from EMBL')
              parser.add_argument('-f', '--filteredfile', help='filtered file from EMBL', required=True)
              parser.add_argument('outputType', choices=['exonList', 'fastaExonList', 'mergedFastaList'], help='Type one of three output options without quotes: \'exonList\' outputs a list with Allele:exon,exonsequence ; \'fastaExonList\' outputs a fasta file split into exons ; \'mergedFastaList\' outputs a fasta file with the merged exons for each allele and the header name of the allele')
              args = parser.parse_args()


              alleleList = []
              with open(args.filteredfile, 'r') as f:
                  s_id = ''
                  s_alleleID = ''
                  s_exonList = []
                  alleleExonRange = []
                  alleleExonSeq = []
                  exonSeqs = ''
                  exonCt = 1
                  seqFlag = False
                  for i in f:
                      i = i.rstrip('\r\n')
                      sLine = i.split(' ')
                      sLineList = list(filter(lambda x: x != '', sLine))
                      if i[0:3] == 'KW':
                          s_id = sLineList[1]
                      elif i[0:3] == 'ID':
                          s_alleleID = sLineList[1]
                      elif i[0:3] == 'FT':
                          if sLineList[1] == 'allele':
                              alleleExonRange = parsedAndCastRange(sLineList[2])
                          elif sLineList[1] == 'exon':
                              exonSeqRange = parsedAndCastRange(sLineList[2])
                              exonN = 'exon' + str(exonCt)
                              alleleExonSeq.append((exonN, exonSeqRange))
                              exonCt += 1
                          else:
                              continue
                      elif i[0:3] == 'SQ':
                          seqFlag = True
                          continue
                      else:
                          if seqFlag:
                              if i[0:2] == '##':
                                  # modify this as necessary.  It is unknown why python string indices 
                                  # are completely wacky--possibly something Bash does?
                                  # be advised you may need to change the range from 0:3 
                                  # or change the string range in the parsing above to 0:2 
                                  seqFlag = False
                                  alleleList.append([s_id, s_alleleID, alleleExonRange, alleleExonSeq, exonSeqs])
                                  alleleExonSeq = []
                                  exonCt = 1
                                  exonSeqs = ''
                                  continue
                              else:
                                  s = ''.join(sLineList[:-1])
                                  exonSeqs += s
                          else:
                              continue

              if args.outputType == 'exonList':
                  res = parsedExonOutput(alleleList)
                  for i in res:
                      print(i)
              else:
                  if args.outputType == 'mergedFastaList':
                      for i in alleleList:
                          hString = '>' + str(i[0]) + ' ' + str(i[1])
                          seqString = str(i[-1])
                          print(hString + '\n' + seqString)
                  else:
                      res = parsedExonOutput(alleleList)
                      for i in res:
                          iSplit = i.split(',')
                          hString = '>' + str(iSplit[0])
                          seqString = str(iSplit[1])
                          print(hString + '\n' + seqString)