Skip to content

Instantly share code, notes, and snippets.

@lsauer
Created May 20, 2012 20:11
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lsauer/2759399 to your computer and use it in GitHub Desktop.
Save lsauer/2759399 to your computer and use it in GitHub Desktop.
C# / JavaScript - Splitting a string / sequence into parts of equal length ; split after n number of characters ; split sequence into GFF / FASTA like structure
//www.lsauer.com 2012
//FASTA linear sequence
/**
 * Splits a string into consecutive pieces of at most `size` characters.
 * Every piece has exactly `size` characters except possibly the last one,
 * which holds whatever remains.
 *
 * @param {string} seq  the text to cut up (e.g. a linear protein sequence)
 * @param {number} [size=50]  piece length; must be a positive integer
 * @returns {string[]} the pieces in order; an empty array for an empty string
 * @throws {RangeError} if `size` is not a positive integer
 */
function splitEvery(seq, size) {
    if (size === undefined) { size = 50; }
    if (typeof size !== 'number' || size < 1 || size % 1 !== 0) {
        throw new RangeError('size must be a positive integer');
    }
    // [\s\S] matches any character, line breaks included, so every character
    // counts towards the piece length. The greedy {1,size} quantifier fills
    // each piece completely before the shorter remainder is emitted.
    // match() with the g flag returns null when nothing matches (empty input).
    return seq.match(new RegExp('[\\s\\S]{1,' + size + '}', 'g')) || [];
}

splitEvery("MEIEKSNNGGSNPSAGEEFKDMIKGVTKFLMMVIFLGTIMLWIMMPTLTYRTKWLPHLRIKFGTSTYFGATGTTLFMYMFPMMVVACLGCVYLHFKNRKSPHHIDRETKGGVWSKLRKPMLVKGPLGIVSVTEITFLAMFVALLLWCFITYLRNSFATITPKSAAAHDESLWQAKLESAALRLGLIGNICLAFLFLPVARGSSLLPAMGLTSESSIKYHIWLGHMVMALFTVHGLCYIIYWASMHEISQMIMWDTKGVSNLAGEIALAAGLVMWATTYPKIRRRFFEVFFYTHYLYIVFMLFFVLHVGISFSFIALPGFYIFLVDRFLRFLQSRENVRLLAARILPSDTMELTFSKNSKLVYSPTSIMFVNIPSISKLQWHPFTITSSSKLEPEKLSIVIKKEGKWSTKLHQRLSSSDQIDRLAVSVEGPYGPASADFLRHEALVMVCGGSGITPFISVIRDLIATSQKETCKIPKITLICAFKKSSEISMLDLVLPLSGLETELSSDINIKIEAFITRDNDAGDEAKAGKIKTLWFKPSLSDQSISSILGPNSWLWLGAILASSFLIFMIIIGIITRYYIYPIDHNTNKIYSLTSKTIIYILVISVSIMATCSAAMLWNKKKYGKVESKQVQNVDRPSPTSSPTSSWGYNSLREIESTPQESLVQRTNLHFGERPNLKKLLLDVEGSSVGVLVCGPKKMRQKVAEICSSGLAENLHFESISFSW", 50);
//there is also the neat CSS property 'word-break:break-all;' (and 'word-wrap'), but neither lets you specify the exact number of characters after which to break.
//result - useful for FASTA splitting
["MEIEKSNNGGSNPSAGEEFKDMIKGVTKFLMMVIFLGTIMLWIMMPTLTY", "RTKWLPHLRIKFGTSTYFGATGTTLFMYMFPMMVVACLGCVYLHFKNRKS",
"PHHIDRETKGGVWSKLRKPMLVKGPLGIVSVTEITFLAMFVALLLWCFIT", "YLRNSFATITPKSAAAHDESLWQAKLESAALRLGLIGNICLAFLFLPVAR",
"GSSLLPAMGLTSESSIKYHIWLGHMVMALFTVHGLCYIIYWASMHEISQM", "IMWDTKGVSNLAGEIALAAGLVMWATTYPKIRRRFFEVFFYTHYLYIVFM",
"LFFVLHVGISFSFIALPGFYIFLVDRFLRFLQSRENVRLLAARILPSDTM", "ELTFSKNSKLVYSPTSIMFVNIPSISKLQWHPFTITSSSKLEPEKLSIVI",
"KKEGKWSTKLHQRLSSSDQIDRLAVSVEGPYGPASADFLRHEALVMVCGG", "SGITPFISVIRDLIATSQKETCKIPKITLICAFKKSSEISMLDLVLPLSG",
"LETELSSDINIKIEAFITRDNDAGDEAKAGKIKTLWFKPSLSDQSISSIL", "GPNSWLWLGAILASSFLIFMIIIGIITRYYIYPIDHNTNKIYSLTSKTII",
"YILVISVSIMATCSAAMLWNKKKYGKVESKQVQNVDRPSPTSSPTSSWGY", "NSLREIESTPQESLVQRTNLHFGERPNLKKLLLDVEGSSVGVLVCGPKKM",
"RQKVAEICSSGLAENLHFESISFSW"]
//in GFF / GPFF formatting
/**
 * Lays a sequence out as numbered lines of space-separated groups.
 * Each line starts with the 1-based position of its first character,
 * right-aligned to `numberWidth` columns, followed by up to `groupsPerLine`
 * groups of `groupSize` characters. Lines are separated by '\n'; there is no
 * leading or trailing line break and no trailing space.
 *
 * @param {string} seq  the linear sequence to format
 * @param {number} [groupSize=10]  characters per group; positive integer
 * @param {number} [groupsPerLine=6]  groups per line; positive integer
 * @param {number} [numberWidth=4]  minimum width of the position column
 * @returns {string} the formatted text; '' for an empty sequence
 * @throws {RangeError} if `groupSize` or `groupsPerLine` is not a positive integer
 */
function formatSequenceBlocks(seq, groupSize, groupsPerLine, numberWidth) {
    if (groupSize === undefined) { groupSize = 10; }
    if (groupsPerLine === undefined) { groupsPerLine = 6; }
    if (numberWidth === undefined) { numberWidth = 4; }
    if (typeof groupSize !== 'number' || groupSize < 1 || groupSize % 1 !== 0) {
        throw new RangeError('groupSize must be a positive integer');
    }
    // A non-positive step would keep the row loop below from ever advancing.
    if (typeof groupsPerLine !== 'number' || groupsPerLine < 1 || groupsPerLine % 1 !== 0) {
        throw new RangeError('groupsPerLine must be a positive integer');
    }

    // match() with the g flag returns null for an empty sequence.
    var groups = seq.match(new RegExp('[\\s\\S]{1,' + groupSize + '}', 'g')) || [];
    var rows = [];
    for (var first = 0; first < groups.length; first += groupsPerLine) {
        // 1-based position of the first character on this line.
        var label = String(first * groupSize + 1);
        // Pad on the left one space at a time; labels already wider than the
        // column are left as they are instead of being padded again.
        while (label.length < numberWidth) { label = ' ' + label; }
        rows.push(label + ' ' + groups.slice(first, first + groupsPerLine).join(' '));
    }
    return rows.join('\n');
}

formatSequenceBlocks("MEIEKSNNGGSNPSAGEEFKDMIKGVTKFLMMVIFLGTIMLWIMMPTLTYRTKWLPHLRIKFGTSTYFGATGTTLFMYMFPMMVVACLGCVYLHFKNRKSPHHIDRETKGGVWSKLRKPMLVKGPLGIVSVTEITFLAMFVALLLWCFITYLRNSFATITPKSAAAHDESLWQAKLESAALRLGLIGNICLAFLFLPVARGSSLLPAMGLTSESSIKYHIWLGHMVMALFTVHGLCYIIYWASMHEISQMIMWDTKGVSNLAGEIALAAGLVMWATTYPKIRRRFFEVFFYTHYLYIVFMLFFVLHVGISFSFIALPGFYIFLVDRFLRFLQSRENVRLLAARILPSDTMELTFSKNSKLVYSPTSIMFVNIPSISKLQWHPFTITSSSKLEPEKLSIVIKKEGKWSTKLHQRLSSSDQIDRLAVSVEGPYGPASADFLRHEALVMVCGGSGITPFISVIRDLIATSQKETCKIPKITLICAFKKSSEISMLDLVLPLSGLETELSSDINIKIEAFITRDNDAGDEAKAGKIKTLWFKPSLSDQSISSILGPNSWLWLGAILASSFLIFMIIIGIITRYYIYPIDHNTNKIYSLTSKTIIYILVISVSIMATCSAAMLWNKKKYGKVESKQVQNVDRPSPTSSPTSSWGYNSLREIESTPQESLVQRTNLHFGERPNLKKLLLDVEGSSVGVLVCGPKKMRQKVAEICSSGLAENLHFESISFSW");
//>result
1 MEIEKSNNGG SNPSAGEEFK DMIKGVTKFL MMVIFLGTIM LWIMMPTLTY RTKWLPHLRI
61 KFGTSTYFGA TGTTLFMYMF PMMVVACLGC VYLHFKNRKS PHHIDRETKG GVWSKLRKPM
121 LVKGPLGIVS VTEITFLAMF VALLLWCFIT YLRNSFATIT PKSAAAHDES LWQAKLESAA
181 LRLGLIGNIC LAFLFLPVAR GSSLLPAMGL TSESSIKYHI WLGHMVMALF TVHGLCYIIY
241 WASMHEISQM IMWDTKGVSN LAGEIALAAG LVMWATTYPK IRRRFFEVFF YTHYLYIVFM
301 LFFVLHVGIS FSFIALPGFY IFLVDRFLRF LQSRENVRLL AARILPSDTM ELTFSKNSKL
361 VYSPTSIMFV NIPSISKLQW HPFTITSSSK LEPEKLSIVI KKEGKWSTKL HQRLSSSDQI
421 DRLAVSVEGP YGPASADFLR HEALVMVCGG SGITPFISVI RDLIATSQKE TCKIPKITLI
481 CAFKKSSEIS MLDLVLPLSG LETELSSDIN IKIEAFITRD NDAGDEAKAG KIKTLWFKPS
541 LSDQSISSIL GPNSWLWLGA ILASSFLIFM IIIGIITRYY IYPIDHNTNK IYSLTSKTII
601 YILVISVSIM ATCSAAMLWN KKKYGKVESK QVQNVDRPSP TSSPTSSWGY NSLREIESTP
661 QESLVQRTNL HFGERPNLKK LLLDVEGSSV GVLVCGPKKM RQKVAEICSS GLAENLHFES
721 ISFSW
//lo sauer, 2012 - free use
//analogous solution in C#, using LINQ to filter out the empty elements
using System;
using System.Linq;
using System.Text.RegularExpressions;
// Cut the sequence into consecutive 50-character pieces.
// The pattern is wrapped in a capture group, so Regex.Split returns each captured
// 50-character piece in addition to the text lying between matches. That in-between
// text is empty here, except for the final remainder that is shorter than 50
// characters. The Where clause drops the empty entries.
// No RegexOptions are passed: Multiline only changes how ^ and $ match, and this
// pattern contains neither anchor.
// Where and ToArray are LINQ extension methods and need the System.Linq namespace.
Regex.Split(
    "MEIEKSNNGGSNPSAGEEFKDMIKGVTKFLMMVIFLGTIMLWIMMPTLTYRTKWLPHLRIKFGTSTYFGATGTTLFMYMFPMMVVACLGCVYLHFKNRKSPHHIDRETKGGVWSKLRKPMLVKGPLGIVSVTEITFLAMFVALLLWCFITYLRNSFATITPKSAAAHDESLWQAKLESAALRLGLIGNICLAFLFLPVARGSSLLPAMGLTSESSIKYHIWLGHMVMALFTVHGLCYIIYWASMHEISQMIMWDTKGVSNLAGEIALAAGLVMWATTYPKIRRRFFEVFFYTHYLYIVFMLFFVLHVGISFSFIALPGFYIFLVDRFLRFLQSRENVRLLAARILPSDTMELTFSKNSKLVYSPTSIMFVNIPSISKLQWHPFTITSSSKLEPEKLSIVIKKEGKWSTKLHQRLSSSDQIDRLAVSVEGPYGPASADFLRHEALVMVCGGSGITPFISVIRDLIATSQKETCKIPKITLICAFKKSSEISMLDLVLPLSGLETELSSDINIKIEAFITRDNDAGDEAKAGKIKTLWFKPSLSDQSISSILGPNSWLWLGAILASSFLIFMIIIGIITRYYIYPIDHNTNKIYSLTSKTIIYILVISVSIMATCSAAMLWNKKKYGKVESKQVQNVDRPSPTSSPTSSWGYNSLREIESTPQESLVQRTNLHFGERPNLKKLLLDVEGSSVGVLVCGPKKMRQKVAEICSSGLAENLHFESISFSW",
    "(.{50})")
    .Where(s => !string.IsNullOrEmpty(s))
    .ToArray();
//> Result (courtesy of the great REPL Mono CS Shell2: http://www.mono-project.com/CsharpRepl )
{ "MEIEKSNNGGSNPSAGEEFKDMIKGVTKFLMMVIFLGTIMLWIMMPTLTY", "RTKWLPHLRIKFGTSTYFGATGTTLFMYMFPMMVVACLGCVYLHFKNRKS",
"PHHIDRETKGGVWSKLRKPMLVKGPLGIVSVTEITFLAMFVALLLWCFIT", "YLRNSFATITPKSAAAHDESLWQAKLESAALRLGLIGNICLAFLFLPVAR",
"GSSLLPAMGLTSESSIKYHIWLGHMVMALFTVHGLCYIIYWASMHEISQM", "IMWDTKGVSNLAGEIALAAGLVMWATTYPKIRRRFFEVFFYTHYLYIVFM",
"LFFVLHVGISFSFIALPGFYIFLVDRFLRFLQSRENVRLLAARILPSDTM", "ELTFSKNSKLVYSPTSIMFVNIPSISKLQWHPFTITSSSKLEPEKLSIVI",
"KKEGKWSTKLHQRLSSSDQIDRLAVSVEGPYGPASADFLRHEALVMVCGG", "SGITPFISVIRDLIATSQKETCKIPKITLICAFKKSSEISMLDLVLPLSG",
"LETELSSDINIKIEAFITRDNDAGDEAKAGKIKTLWFKPSLSDQSISSIL", "GPNSWLWLGAILASSFLIFMIIIGIITRYYIYPIDHNTNKIYSLTSKTII",
"YILVISVSIMATCSAAMLWNKKKYGKVESKQVQNVDRPSPTSSPTSSWGY", "NSLREIESTPQESLVQRTNLHFGERPNLKKLLLDVEGSSVGVLVCGPKKM",
"RQKVAEICSSGLAENLHFESISFSW"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment