using System; | |
using System.Collections.Generic; | |
using System.IO; | |
using System.Linq; | |
using System.Text.RegularExpressions; | |
namespace ImmunopepModel | |
{ | |
public class Gene | |
{ | |
public string Id { private set; get; } | |
public string[] Proteins { private set; get; } | |
public string[] Sequences { private set; get; } | |
public double HalfLife { set; get; } | |
public double Expression { set; get; } | |
public PeptideRecord[] Peptides { set; get; } | |
public Gene(string id, string[] proteins, string[] sequences) | |
{ | |
Id = id; | |
int imax = 0; | |
for (int i = 1; i < proteins.Length; i++) | |
{ | |
if (sequences[imax].Length < sequences[i].Length) imax = i; | |
} | |
if (imax != 0) | |
{ | |
string tmp; | |
tmp = sequences[imax]; | |
sequences[imax] = sequences[0]; | |
sequences[0] = tmp; | |
tmp = proteins[imax]; | |
proteins[imax] = proteins[0]; | |
proteins[0] = tmp; | |
} | |
Proteins = proteins; | |
Sequences = sequences; | |
HalfLife = 0.0; | |
Expression = 0.0; | |
} | |
} | |
public class FastaRecord | |
{ | |
public string GeneId { private set; get; } | |
public string ProtId { private set; get; } | |
public string Sequence { private set; get; } | |
public static Regex ProtRegExp = new Regex(@">([^\s]*)"); | |
public static Regex GeneRegExp = new Regex(@"gene:([^\s]*)"); | |
public FastaRecord(string geneId, string protId, string sequence) | |
{ | |
GeneId = geneId; | |
ProtId = protId; | |
Sequence = sequence; | |
} | |
} | |
public class PeptideRecord | |
{ | |
public string Sequence { private set; get; } | |
public int[] Msms { private set; get; } | |
public double Score { private set; get; } | |
public PeptideRecord(string sequence, int[] msmscounts, double score) | |
{ | |
Sequence = sequence; | |
Msms = msmscounts; | |
Score = score; | |
} | |
} | |
public class MasterMap | |
{ | |
public char[] Seq { private set; get; } | |
public byte[] Cov { private set; get; } | |
public int[] Gene { private set; get; } | |
public int Len => Seq.Length; | |
public MasterMap(char[] seq, byte[] coverage, int[] gene) | |
{ | |
Seq = seq; | |
Cov = coverage; | |
Gene = gene; | |
} | |
} | |
class Program | |
{ | |
public const int MinLength = 8; | |
public const int MaxLength = 13; | |
public const int Flanking = 5; | |
static void Main(string[] args) | |
{ | |
Random r = new Random(123); | |
string fasta_file = @"C:\Users\sinitcyn\Documents\Projects\Rudolph\modeling\Homo_sapiens.GRCh38.pep.all.fa"; | |
string transcript_rpkm_file = @"C:\Users\sinitcyn\Documents\Projects\Rudolph\modeling\transcriptome.rpkm.txt"; | |
string half_life_file = @"C:\Users\sinitcyn\Documents\Projects\Rudolph\modeling\Savitski2018.processed.txt"; | |
string peptides_file = @"C:\Users\sinitcyn\Documents\Projects\Rudolph\modeling\peptides.txt"; | |
string output_file = @"C:\Users\sinitcyn\Documents\Projects\Rudolph\modeling\peptides.validation.abelin.txt"; | |
Dictionary<string, FastaRecord> fastaRecords = ReadFasta(fasta_file); | |
Dictionary<string, Gene> genes = new Dictionary<string, Gene>(); | |
GetGene2Prots(fastaRecords, genes); | |
ReadTranscript(transcript_rpkm_file, genes); | |
ReadHalfLife(half_life_file, genes); | |
string[] hlaNames; | |
ReadPeptide(peptides_file, genes, fastaRecords, out hlaNames); | |
//UpdateHlaNames(hlaNames); | |
using (StreamWriter sw = new StreamWriter(output_file)) | |
{ | |
WriteHeader(sw); | |
int numRecords = WriteTargets(sw, genes, hlaNames, out HashSet<string> peptideSet); | |
WriteDecoys(sw, genes, hlaNames, 100 * numRecords, r, peptideSet); | |
} | |
} | |
private static void UpdateHlaNames(string[] hlaNames) { | |
for (int i = 0; i < hlaNames.Length; i++) { | |
string s = hlaNames[i]; | |
hlaNames[i] = $"HLA-{s.Substring(0, 3)}:{s.Substring(3, 2)}"; | |
} | |
} | |
private static void WriteHeader(StreamWriter sw) | |
{ | |
sw.WriteLine(string.Join("\t", | |
new string[] { | |
"sequence", "index", "target(+) or decoy(-) abelin", "N-seq", "N-pos", "C-seq", "C-pos", "proteinId", "geneId", | |
"MHC identified", "MHC msms count", "Gene expression", "Mean Half-Life Savitski18" | |
})); | |
} | |
private static void WriteDecoys(StreamWriter sw, Dictionary<string, Gene> genes, string[] hlaNames, int numRecords, Random r, | |
HashSet<string> peptideSet) | |
{ | |
List<KeyValuePair<Gene, int>>[] res = new List<KeyValuePair<Gene, int>>[MaxLength - MinLength + 1]; | |
for (int i = 0; i <= MaxLength - MinLength; i++) | |
{ | |
res[i] = new List<KeyValuePair<Gene, int>>(); | |
} | |
foreach (var gene in genes) | |
{ | |
string s = gene.Value.Sequences[0]; | |
bool[] b = new bool[s.Length]; | |
foreach (var peptide in gene.Value.Peptides) | |
{ | |
int i = s.IndexOf(peptide.Sequence); | |
if (i != -1) | |
{ | |
for (int j = 0; j < peptide.Sequence.Length; j++) | |
{ | |
b[j + i] = true; | |
} | |
} | |
} | |
for (int i = 0; i <= b.Length - MinLength; i++) | |
{ | |
for (int j = 0; j <= MaxLength - MinLength; j++) | |
{ | |
if (i + j + MinLength > b.Length) break; | |
int sum = 0; | |
for (int k = i; k < i + j + MinLength; k++) if (b[k]) sum++; | |
if (sum > ((MinLength + j) >> 1)) continue; | |
res[j].Add(new KeyValuePair<Gene, int>(gene.Value, i)); | |
} | |
} | |
} | |
double[][] hlaLenDist = GetLengthDist(genes); //lengths x hlas | |
KeyValuePair<Gene, int>[] result; | |
for (int i = 0; i < MaxLength - MinLength + 1; i++) | |
{ | |
int pepLength = i + MinLength; | |
result = res[i].OrderBy(x => r.Next()).ToArray(); | |
int l = 0; | |
for (int j = 0; j < hlaLenDist[i].Length; j++) | |
{ | |
int until = (int)(hlaLenDist[i][j] * numRecords); | |
int k = 0; | |
for (; k < until; k++) | |
{ | |
Gene g = result[l + k].Key; | |
int pos = result[l + k].Value; | |
string pepSeq = g.Sequences[0].Substring(pos, pepLength); | |
if (peptideSet.Contains(pepSeq) || pepSeq.IndexOf('U') != -1 ) | |
{ | |
until++; | |
continue; | |
} | |
else | |
{ | |
peptideSet.Add(pepSeq); | |
} | |
int mi = Math.Max(0, pos - Flanking); | |
int f = pos + MinLength + i; | |
int ma = Math.Min(f + Flanking, g.Sequences[0].Length); | |
sw.WriteLine( | |
string.Join("\t", | |
new string[] { | |
pepSeq, "0", "-", | |
g.Sequences[0].Substring(mi, pos - mi), pos.ToString(), | |
g.Sequences[0].Substring(f, ma - f), (g.Sequences[0].Length-f).ToString(), | |
g.Proteins[0], g.Id, | |
hlaNames[j], "0", | |
String.Format("{0:0.000}", g.Expression), String.Format("{0:0.000}", g.HalfLife) | |
})); | |
} | |
l += k; | |
} | |
} | |
} | |
private static int WriteTargets(StreamWriter sw, Dictionary<string, Gene> genes, string[] expNames, out HashSet<string> peptideSet) | |
{ | |
peptideSet = new HashSet<string>(); | |
int count = 0; | |
foreach (var gene in genes) | |
{ | |
foreach (var peptide in gene.Value.Peptides) | |
{ | |
string proteinseq = gene.Value.Sequences[0]; | |
string peptideseq = peptide.Sequence; | |
int i = proteinseq.IndexOf(peptideseq); | |
if (i == -1) continue; | |
int mi = Math.Max(0, i - Flanking); | |
int j = i + peptideseq.Length; | |
int ma = Math.Min(j + Flanking, proteinseq.Length); | |
List<string> mhc_identified = new List<string>(); | |
List<string> mhc_msms_identified = new List<string>(); | |
for (int k = 0; k < peptide.Msms.Length; k++) | |
{ | |
if (peptide.Msms[k] > 0) | |
{ | |
mhc_identified.Add(expNames[k]); | |
mhc_msms_identified.Add(peptide.Msms[k].ToString()); | |
} | |
} | |
for (int k = 0; k < mhc_identified.Count; k++) | |
{ | |
count++; | |
peptideSet.Add(peptideseq); | |
sw.WriteLine( | |
string.Join("\t", | |
new string[] { | |
peptideseq, k.ToString(), "+", | |
proteinseq.Substring(mi, i - mi), i.ToString(), | |
proteinseq.Substring(j, ma - j), (proteinseq.Length-j).ToString(), | |
gene.Value.Proteins[0], gene.Key, | |
mhc_identified[k], mhc_msms_identified[k], | |
String.Format("{0:0.000}", gene.Value.Expression), String.Format("{0:0.000}", gene.Value.HalfLife) | |
})); | |
} | |
} | |
} | |
return count; | |
} | |
private static double[][] GetLengthDist(Dictionary<string, Gene> genes) | |
{ | |
int len = genes.Values.First().Peptides[0].Msms.Length; | |
int[][] result = new int[MaxLength - MinLength + 1][]; | |
double[][] result0 = new double[MaxLength - MinLength + 1][]; | |
for (int i = 0; i < result.Length; i++) | |
{ | |
result[i] = new int[len]; | |
result0[i] = new double[len]; | |
} | |
foreach (var gene in genes) | |
{ | |
foreach (var peptide in gene.Value.Peptides) | |
{ | |
for (int i = 0; i < peptide.Msms.Length; i++) | |
{ | |
if (peptide.Msms[i] != 0) result[peptide.Sequence.Length - MinLength][i]++; | |
} | |
} | |
} | |
int s = 0; | |
for (int i = 0; i < result.Length; i++) | |
{ | |
for (int j = 0; j < result[i].Length; j++) | |
{ | |
s += result[i][j]; | |
} | |
} | |
for (int i = 0; i < result.Length; i++) | |
{ | |
for (int j = 0; j < result[i].Length; j++) | |
{ | |
result0[i][j] = (double)result[i][j] / s; | |
} | |
} | |
return result0; | |
} | |
private static void ReadPeptide(string peptides_file, Dictionary<string, Gene> genes, Dictionary<string, FastaRecord> fasta, | |
out string[] expNames) | |
{ | |
Dictionary<string, List<PeptideRecord>> tmpd = new Dictionary<string, List<PeptideRecord>>(); | |
List<PeptideRecord> tmpe; | |
using (StreamReader sr = new StreamReader(peptides_file)) | |
{ | |
string[] header = sr.ReadLine().Split('\t'); | |
int sequenceIndex = Array.IndexOf(header, "Sequence"); | |
int proteinsIndex = Array.IndexOf(header, "Proteins"); | |
int scoreIndex = Array.IndexOf(header, "Score"); | |
int reverseIndex = Array.IndexOf(header, "Reverse"); | |
int potentialIndex = Array.IndexOf(header, "Potential contaminant"); | |
Regex experimentRegExp = new Regex(@"Experiment (.*)"); | |
List<int> experimentTmpIndex = new List<int>(); | |
List<string> experimentTmpNames = new List<string>(); | |
int blankIndex = -1; | |
int hlaNegIndex = -1; | |
for (int i = 0; i < header.Length; i++) | |
{ | |
var x = experimentRegExp.Match(header[i]); | |
if (x.Success) | |
{ | |
if (x.Groups[1].Value == "Blank") | |
{ | |
blankIndex = i; | |
continue; | |
} | |
if (x.Groups[1].Value == "HLAneg") | |
{ | |
hlaNegIndex = i; | |
continue; | |
} | |
experimentTmpIndex.Add(i); | |
experimentTmpNames.Add(x.Groups[1].Value); | |
} | |
} | |
int[] experimentIndex = experimentTmpIndex.ToArray(); | |
expNames = experimentTmpNames.ToArray(); | |
string line; | |
FastaRecord f; | |
Gene g; | |
while ((line = sr.ReadLine()) != null) | |
{ | |
string[] l = line.Split('\t'); | |
string[] proteins = l[proteinsIndex].Split(';'); | |
Dictionary<string, Gene> genedict = new Dictionary<string, Gene>(); | |
for (int i = 0; i < proteins.Length; i++) | |
if (fasta.TryGetValue(proteins[0], out f) && genes.TryGetValue(f.GeneId, out g) && !genedict.ContainsKey(f.GeneId)) | |
genedict.Add(f.GeneId, g); | |
if (genedict.Count != 1) continue; //multiple-mapped peptide filter | |
g = genedict.Values.ToArray()[0]; | |
if (l[reverseIndex] != "+" && l[potentialIndex] != "+" && //reverse and potential filter | |
MinLength <= l[sequenceIndex].Length && l[sequenceIndex].Length <= MaxLength && //length filter | |
string.IsNullOrEmpty(l[blankIndex]) && string.IsNullOrEmpty(l[hlaNegIndex]) && //negative control filter | |
g.Expression > 0.0) //expression filter | |
{ | |
int[] msms = new int[experimentIndex.Length]; | |
for (int i = 0; i < experimentIndex.Length; i++) | |
{ | |
msms[i] = String.IsNullOrEmpty(l[experimentIndex[i]]) ? 0 : Convert.ToInt32(l[experimentIndex[i]]); | |
} | |
if (msms.Sum() <= 1) continue; | |
PeptideRecord pr = new PeptideRecord(l[sequenceIndex], msms, Convert.ToDouble(l[scoreIndex])); | |
if (tmpd.TryGetValue(g.Id, out tmpe)) | |
tmpe.Add(pr); | |
else | |
tmpd.Add(g.Id, new List<PeptideRecord> { pr }); | |
} | |
} | |
} | |
foreach (var i in tmpd) | |
{ | |
genes[i.Key].Peptides = i.Value.ToArray(); | |
} | |
List<string> rmList = new List<string>(); | |
foreach (var i in genes) | |
if (i.Value.Peptides == null) rmList.Add(i.Key); | |
foreach (var i in rmList) | |
genes.Remove(i); | |
} | |
private static void ReadHalfLife(string halflife_file, Dictionary<string, Gene> genes) | |
{ | |
Gene tmp; | |
using (StreamReader sr = new StreamReader(halflife_file)) | |
{ | |
string[] header = sr.ReadLine().Split('\t'); | |
int geneId = Array.IndexOf(header, "Gene stable ID"); | |
int mean = Array.IndexOf(header, "Mean"); | |
string line; | |
while ((line = sr.ReadLine()) != null) | |
{ | |
string[] l = line.Split('\t'); | |
if (genes.TryGetValue(l[geneId], out tmp)) | |
{ | |
tmp.HalfLife = Convert.ToDouble(l[mean]); | |
} | |
} | |
} | |
} | |
private static void ReadTranscript(string transcript_file, Dictionary<string, Gene> genes) | |
{ | |
using (StreamReader sr = new StreamReader(transcript_file)) | |
{ | |
string[] header = sr.ReadLine().Split('\t'); | |
int geneId = Array.IndexOf(header, "Gene id"); | |
int mean = Array.IndexOf(header, "Mean"); | |
Gene g; | |
string line; | |
while ((line = sr.ReadLine()) != null) | |
{ | |
string[] l = line.Split('\t'); | |
if (genes.TryGetValue(l[geneId], out g)) | |
{ | |
g.Expression = Convert.ToDouble(l[mean]); | |
} | |
} | |
} | |
} | |
private static void GetGene2Prots(Dictionary<string, FastaRecord> fastaRecords, Dictionary<string, Gene> genes) | |
{ | |
Dictionary<string, List<string>> tmp = new Dictionary<string, List<string>>(); | |
List<string> tmpValue; | |
foreach (var record in fastaRecords) | |
{ | |
if (tmp.TryGetValue(record.Value.GeneId, out tmpValue)) | |
{ | |
tmpValue.Add(record.Value.ProtId); | |
} | |
else | |
{ | |
tmp.Add(record.Value.GeneId, new List<string> { record.Value.ProtId }); | |
} | |
} | |
foreach (var t in tmp) | |
{ | |
List<string> sequences = new List<string>(); | |
foreach (string p in t.Value) | |
{ | |
sequences.Add(fastaRecords[p].Sequence); | |
} | |
genes.Add(t.Key, new Gene(t.Key, t.Value.ToArray(), sequences.ToArray())); | |
} | |
} | |
private static Dictionary<string, FastaRecord> ReadFasta(string fasta_file) | |
{ | |
Dictionary<string, FastaRecord> result = new Dictionary<string, FastaRecord>(); | |
using (StreamReader sr = new StreamReader(fasta_file)) | |
{ | |
string line; | |
List<string> buffer = new List<string>(); | |
while ((line = sr.ReadLine()) != null) | |
{ | |
if (line[0] == '>' && buffer.Count != 0) | |
{ | |
FastaRecord fr = new FastaRecord(FastaRecord.GeneRegExp.Match(buffer[0]).Groups[1].Value.Split('.')[0], | |
FastaRecord.ProtRegExp.Match(buffer[0]).Groups[1].Value.Split('.')[0], | |
string.Concat(buffer.Skip(1))); | |
if (!fr.Sequence.Contains('*') && fr.Sequence.Length > MaxLength) | |
{ | |
result[fr.ProtId] = fr; | |
} | |
buffer.Clear(); | |
} | |
buffer.Add(line); | |
} | |
} | |
return result; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment