Skip to content

Instantly share code, notes, and snippets.

@pgsin
Last active September 13, 2018 22:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pgsin/a4c175aedafac4295b5175aedfaf50fa to your computer and use it in GitHub Desktop.
Save pgsin/a4c175aedafac4295b5175aedfaf50fa to your computer and use it in GitHub Desktop.
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
namespace ImmunopepModel
{
public class Gene
{
public string Id { private set; get; }
public string[] Proteins { private set; get; }
public string[] Sequences { private set; get; }
public double HalfLife { set; get; }
public double Expression { set; get; }
public PeptideRecord[] Peptides { set; get; }
public Gene(string id, string[] proteins, string[] sequences)
{
Id = id;
int imax = 0;
for (int i = 1; i < proteins.Length; i++)
{
if (sequences[imax].Length < sequences[i].Length) imax = i;
}
if (imax != 0)
{
string tmp;
tmp = sequences[imax];
sequences[imax] = sequences[0];
sequences[0] = tmp;
tmp = proteins[imax];
proteins[imax] = proteins[0];
proteins[0] = tmp;
}
Proteins = proteins;
Sequences = sequences;
HalfLife = 0.0;
Expression = 0.0;
}
}
public class FastaRecord
{
public string GeneId { private set; get; }
public string ProtId { private set; get; }
public string Sequence { private set; get; }
public static Regex ProtRegExp = new Regex(@">([^\s]*)");
public static Regex GeneRegExp = new Regex(@"gene:([^\s]*)");
public FastaRecord(string geneId, string protId, string sequence)
{
GeneId = geneId;
ProtId = protId;
Sequence = sequence;
}
}
public class PeptideRecord
{
public string Sequence { private set; get; }
public int[] Msms { private set; get; }
public double Score { private set; get; }
public PeptideRecord(string sequence, int[] msmscounts, double score)
{
Sequence = sequence;
Msms = msmscounts;
Score = score;
}
}
public class MasterMap
{
public char[] Seq { private set; get; }
public byte[] Cov { private set; get; }
public int[] Gene { private set; get; }
public int Len => Seq.Length;
public MasterMap(char[] seq, byte[] coverage, int[] gene)
{
Seq = seq;
Cov = coverage;
Gene = gene;
}
}
class Program
{
public const int MinLength = 8;
public const int MaxLength = 13;
public const int Flanking = 5;
static void Main(string[] args)
{
Random r = new Random(123);
string fasta_file = @"C:\Users\sinitcyn\Documents\Projects\Rudolph\modeling\Homo_sapiens.GRCh38.pep.all.fa";
string transcript_rpkm_file = @"C:\Users\sinitcyn\Documents\Projects\Rudolph\modeling\transcriptome.rpkm.txt";
string half_life_file = @"C:\Users\sinitcyn\Documents\Projects\Rudolph\modeling\Savitski2018.processed.txt";
string peptides_file = @"C:\Users\sinitcyn\Documents\Projects\Rudolph\modeling\peptides.txt";
string output_file = @"C:\Users\sinitcyn\Documents\Projects\Rudolph\modeling\peptides.validation.abelin.txt";
Dictionary<string, FastaRecord> fastaRecords = ReadFasta(fasta_file);
Dictionary<string, Gene> genes = new Dictionary<string, Gene>();
GetGene2Prots(fastaRecords, genes);
ReadTranscript(transcript_rpkm_file, genes);
ReadHalfLife(half_life_file, genes);
string[] hlaNames;
ReadPeptide(peptides_file, genes, fastaRecords, out hlaNames);
//UpdateHlaNames(hlaNames);
using (StreamWriter sw = new StreamWriter(output_file))
{
WriteHeader(sw);
int numRecords = WriteTargets(sw, genes, hlaNames, out HashSet<string> peptideSet);
WriteDecoys(sw, genes, hlaNames, 100 * numRecords, r, peptideSet);
}
}
private static void UpdateHlaNames(string[] hlaNames) {
for (int i = 0; i < hlaNames.Length; i++) {
string s = hlaNames[i];
hlaNames[i] = $"HLA-{s.Substring(0, 3)}:{s.Substring(3, 2)}";
}
}
private static void WriteHeader(StreamWriter sw)
{
sw.WriteLine(string.Join("\t",
new string[] {
"sequence", "index", "target(+) or decoy(-) abelin", "N-seq", "N-pos", "C-seq", "C-pos", "proteinId", "geneId",
"MHC identified", "MHC msms count", "Gene expression", "Mean Half-Life Savitski18"
}));
}
private static void WriteDecoys(StreamWriter sw, Dictionary<string, Gene> genes, string[] hlaNames, int numRecords, Random r,
HashSet<string> peptideSet)
{
List<KeyValuePair<Gene, int>>[] res = new List<KeyValuePair<Gene, int>>[MaxLength - MinLength + 1];
for (int i = 0; i <= MaxLength - MinLength; i++)
{
res[i] = new List<KeyValuePair<Gene, int>>();
}
foreach (var gene in genes)
{
string s = gene.Value.Sequences[0];
bool[] b = new bool[s.Length];
foreach (var peptide in gene.Value.Peptides)
{
int i = s.IndexOf(peptide.Sequence);
if (i != -1)
{
for (int j = 0; j < peptide.Sequence.Length; j++)
{
b[j + i] = true;
}
}
}
for (int i = 0; i <= b.Length - MinLength; i++)
{
for (int j = 0; j <= MaxLength - MinLength; j++)
{
if (i + j + MinLength > b.Length) break;
int sum = 0;
for (int k = i; k < i + j + MinLength; k++) if (b[k]) sum++;
if (sum > ((MinLength + j) >> 1)) continue;
res[j].Add(new KeyValuePair<Gene, int>(gene.Value, i));
}
}
}
double[][] hlaLenDist = GetLengthDist(genes); //lengths x hlas
KeyValuePair<Gene, int>[] result;
for (int i = 0; i < MaxLength - MinLength + 1; i++)
{
int pepLength = i + MinLength;
result = res[i].OrderBy(x => r.Next()).ToArray();
int l = 0;
for (int j = 0; j < hlaLenDist[i].Length; j++)
{
int until = (int)(hlaLenDist[i][j] * numRecords);
int k = 0;
for (; k < until; k++)
{
Gene g = result[l + k].Key;
int pos = result[l + k].Value;
string pepSeq = g.Sequences[0].Substring(pos, pepLength);
if (peptideSet.Contains(pepSeq) || pepSeq.IndexOf('U') != -1 )
{
until++;
continue;
}
else
{
peptideSet.Add(pepSeq);
}
int mi = Math.Max(0, pos - Flanking);
int f = pos + MinLength + i;
int ma = Math.Min(f + Flanking, g.Sequences[0].Length);
sw.WriteLine(
string.Join("\t",
new string[] {
pepSeq, "0", "-",
g.Sequences[0].Substring(mi, pos - mi), pos.ToString(),
g.Sequences[0].Substring(f, ma - f), (g.Sequences[0].Length-f).ToString(),
g.Proteins[0], g.Id,
hlaNames[j], "0",
String.Format("{0:0.000}", g.Expression), String.Format("{0:0.000}", g.HalfLife)
}));
}
l += k;
}
}
}
private static int WriteTargets(StreamWriter sw, Dictionary<string, Gene> genes, string[] expNames, out HashSet<string> peptideSet)
{
peptideSet = new HashSet<string>();
int count = 0;
foreach (var gene in genes)
{
foreach (var peptide in gene.Value.Peptides)
{
string proteinseq = gene.Value.Sequences[0];
string peptideseq = peptide.Sequence;
int i = proteinseq.IndexOf(peptideseq);
if (i == -1) continue;
int mi = Math.Max(0, i - Flanking);
int j = i + peptideseq.Length;
int ma = Math.Min(j + Flanking, proteinseq.Length);
List<string> mhc_identified = new List<string>();
List<string> mhc_msms_identified = new List<string>();
for (int k = 0; k < peptide.Msms.Length; k++)
{
if (peptide.Msms[k] > 0)
{
mhc_identified.Add(expNames[k]);
mhc_msms_identified.Add(peptide.Msms[k].ToString());
}
}
for (int k = 0; k < mhc_identified.Count; k++)
{
count++;
peptideSet.Add(peptideseq);
sw.WriteLine(
string.Join("\t",
new string[] {
peptideseq, k.ToString(), "+",
proteinseq.Substring(mi, i - mi), i.ToString(),
proteinseq.Substring(j, ma - j), (proteinseq.Length-j).ToString(),
gene.Value.Proteins[0], gene.Key,
mhc_identified[k], mhc_msms_identified[k],
String.Format("{0:0.000}", gene.Value.Expression), String.Format("{0:0.000}", gene.Value.HalfLife)
}));
}
}
}
return count;
}
private static double[][] GetLengthDist(Dictionary<string, Gene> genes)
{
int len = genes.Values.First().Peptides[0].Msms.Length;
int[][] result = new int[MaxLength - MinLength + 1][];
double[][] result0 = new double[MaxLength - MinLength + 1][];
for (int i = 0; i < result.Length; i++)
{
result[i] = new int[len];
result0[i] = new double[len];
}
foreach (var gene in genes)
{
foreach (var peptide in gene.Value.Peptides)
{
for (int i = 0; i < peptide.Msms.Length; i++)
{
if (peptide.Msms[i] != 0) result[peptide.Sequence.Length - MinLength][i]++;
}
}
}
int s = 0;
for (int i = 0; i < result.Length; i++)
{
for (int j = 0; j < result[i].Length; j++)
{
s += result[i][j];
}
}
for (int i = 0; i < result.Length; i++)
{
for (int j = 0; j < result[i].Length; j++)
{
result0[i][j] = (double)result[i][j] / s;
}
}
return result0;
}
private static void ReadPeptide(string peptides_file, Dictionary<string, Gene> genes, Dictionary<string, FastaRecord> fasta,
out string[] expNames)
{
Dictionary<string, List<PeptideRecord>> tmpd = new Dictionary<string, List<PeptideRecord>>();
List<PeptideRecord> tmpe;
using (StreamReader sr = new StreamReader(peptides_file))
{
string[] header = sr.ReadLine().Split('\t');
int sequenceIndex = Array.IndexOf(header, "Sequence");
int proteinsIndex = Array.IndexOf(header, "Proteins");
int scoreIndex = Array.IndexOf(header, "Score");
int reverseIndex = Array.IndexOf(header, "Reverse");
int potentialIndex = Array.IndexOf(header, "Potential contaminant");
Regex experimentRegExp = new Regex(@"Experiment (.*)");
List<int> experimentTmpIndex = new List<int>();
List<string> experimentTmpNames = new List<string>();
int blankIndex = -1;
int hlaNegIndex = -1;
for (int i = 0; i < header.Length; i++)
{
var x = experimentRegExp.Match(header[i]);
if (x.Success)
{
if (x.Groups[1].Value == "Blank")
{
blankIndex = i;
continue;
}
if (x.Groups[1].Value == "HLAneg")
{
hlaNegIndex = i;
continue;
}
experimentTmpIndex.Add(i);
experimentTmpNames.Add(x.Groups[1].Value);
}
}
int[] experimentIndex = experimentTmpIndex.ToArray();
expNames = experimentTmpNames.ToArray();
string line;
FastaRecord f;
Gene g;
while ((line = sr.ReadLine()) != null)
{
string[] l = line.Split('\t');
string[] proteins = l[proteinsIndex].Split(';');
Dictionary<string, Gene> genedict = new Dictionary<string, Gene>();
for (int i = 0; i < proteins.Length; i++)
if (fasta.TryGetValue(proteins[0], out f) && genes.TryGetValue(f.GeneId, out g) && !genedict.ContainsKey(f.GeneId))
genedict.Add(f.GeneId, g);
if (genedict.Count != 1) continue; //multiple-mapped peptide filter
g = genedict.Values.ToArray()[0];
if (l[reverseIndex] != "+" && l[potentialIndex] != "+" && //reverse and potential filter
MinLength <= l[sequenceIndex].Length && l[sequenceIndex].Length <= MaxLength && //length filter
string.IsNullOrEmpty(l[blankIndex]) && string.IsNullOrEmpty(l[hlaNegIndex]) && //negative control filter
g.Expression > 0.0) //expression filter
{
int[] msms = new int[experimentIndex.Length];
for (int i = 0; i < experimentIndex.Length; i++)
{
msms[i] = String.IsNullOrEmpty(l[experimentIndex[i]]) ? 0 : Convert.ToInt32(l[experimentIndex[i]]);
}
if (msms.Sum() <= 1) continue;
PeptideRecord pr = new PeptideRecord(l[sequenceIndex], msms, Convert.ToDouble(l[scoreIndex]));
if (tmpd.TryGetValue(g.Id, out tmpe))
tmpe.Add(pr);
else
tmpd.Add(g.Id, new List<PeptideRecord> { pr });
}
}
}
foreach (var i in tmpd)
{
genes[i.Key].Peptides = i.Value.ToArray();
}
List<string> rmList = new List<string>();
foreach (var i in genes)
if (i.Value.Peptides == null) rmList.Add(i.Key);
foreach (var i in rmList)
genes.Remove(i);
}
private static void ReadHalfLife(string halflife_file, Dictionary<string, Gene> genes)
{
Gene tmp;
using (StreamReader sr = new StreamReader(halflife_file))
{
string[] header = sr.ReadLine().Split('\t');
int geneId = Array.IndexOf(header, "Gene stable ID");
int mean = Array.IndexOf(header, "Mean");
string line;
while ((line = sr.ReadLine()) != null)
{
string[] l = line.Split('\t');
if (genes.TryGetValue(l[geneId], out tmp))
{
tmp.HalfLife = Convert.ToDouble(l[mean]);
}
}
}
}
private static void ReadTranscript(string transcript_file, Dictionary<string, Gene> genes)
{
using (StreamReader sr = new StreamReader(transcript_file))
{
string[] header = sr.ReadLine().Split('\t');
int geneId = Array.IndexOf(header, "Gene id");
int mean = Array.IndexOf(header, "Mean");
Gene g;
string line;
while ((line = sr.ReadLine()) != null)
{
string[] l = line.Split('\t');
if (genes.TryGetValue(l[geneId], out g))
{
g.Expression = Convert.ToDouble(l[mean]);
}
}
}
}
private static void GetGene2Prots(Dictionary<string, FastaRecord> fastaRecords, Dictionary<string, Gene> genes)
{
Dictionary<string, List<string>> tmp = new Dictionary<string, List<string>>();
List<string> tmpValue;
foreach (var record in fastaRecords)
{
if (tmp.TryGetValue(record.Value.GeneId, out tmpValue))
{
tmpValue.Add(record.Value.ProtId);
}
else
{
tmp.Add(record.Value.GeneId, new List<string> { record.Value.ProtId });
}
}
foreach (var t in tmp)
{
List<string> sequences = new List<string>();
foreach (string p in t.Value)
{
sequences.Add(fastaRecords[p].Sequence);
}
genes.Add(t.Key, new Gene(t.Key, t.Value.ToArray(), sequences.ToArray()));
}
}
private static Dictionary<string, FastaRecord> ReadFasta(string fasta_file)
{
Dictionary<string, FastaRecord> result = new Dictionary<string, FastaRecord>();
using (StreamReader sr = new StreamReader(fasta_file))
{
string line;
List<string> buffer = new List<string>();
while ((line = sr.ReadLine()) != null)
{
if (line[0] == '>' && buffer.Count != 0)
{
FastaRecord fr = new FastaRecord(FastaRecord.GeneRegExp.Match(buffer[0]).Groups[1].Value.Split('.')[0],
FastaRecord.ProtRegExp.Match(buffer[0]).Groups[1].Value.Split('.')[0],
string.Concat(buffer.Skip(1)));
if (!fr.Sequence.Contains('*') && fr.Sequence.Length > MaxLength)
{
result[fr.ProtId] = fr;
}
buffer.Clear();
}
buffer.Add(line);
}
}
return result;
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment