Created
May 20, 2012 20:11
-
-
Save lsauer/2759399 to your computer and use it in GitHub Desktop.
C# / JavaScript - Splitting a string / sequence into parts of equal length ; split after n number of characters ; split sequence into GFF / FASTA like structure
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//www.lsauer.com 2012 | |
//FASTA linear sequence | |
// Cut the linear sequence into consecutive 50-character chunks.
// split() with a capturing group returns every captured chunk interleaved with
// the (empty) text found between adjacent matches; the filter discards those
// empty pieces and keeps the shorter remainder at the end.
"MEIEKSNNGGSNPSAGEEFKDMIKGVTKFLMMVIFLGTIMLWIMMPTLTYRTKWLPHLRIKFGTSTYFGATGTTLFMYMFPMMVVACLGCVYLHFKNRKSPHHIDRETKGGVWSKLRKPMLVKGPLGIVSVTEITFLAMFVALLLWCFITYLRNSFATITPKSAAAHDESLWQAKLESAALRLGLIGNICLAFLFLPVARGSSLLPAMGLTSESSIKYHIWLGHMVMALFTVHGLCYIIYWASMHEISQMIMWDTKGVSNLAGEIALAAGLVMWATTYPKIRRRFFEVFFYTHYLYIVFMLFFVLHVGISFSFIALPGFYIFLVDRFLRFLQSRENVRLLAARILPSDTMELTFSKNSKLVYSPTSIMFVNIPSISKLQWHPFTITSSSKLEPEKLSIVIKKEGKWSTKLHQRLSSSDQIDRLAVSVEGPYGPASADFLRHEALVMVCGGSGITPFISVIRDLIATSQKETCKIPKITLICAFKKSSEISMLDLVLPLSGLETELSSDINIKIEAFITRDNDAGDEAKAGKIKTLWFKPSLSDQSISSILGPNSWLWLGAILASSFLIFMIIIGIITRYYIYPIDHNTNKIYSLTSKTIIYILVISVSIMATCSAAMLWNKKKYGKVESKQVQNVDRPSPTSSPTSSWGYNSLREIESTPQESLVQRTNLHFGERPNLKKLLLDVEGSSVGVLVCGPKKMRQKVAEICSSGLAENLHFESISFSW"
    .split(/(.{50})/)
    .filter(function (piece) { return piece !== ''; })
//there is also the neat CSS 'word-break:break-all;' property (and word-wrap), but neither lets you specify the exact number of characters after which to break.
//result - useful for FASTA splitting | |
["MEIEKSNNGGSNPSAGEEFKDMIKGVTKFLMMVIFLGTIMLWIMMPTLTY", "RTKWLPHLRIKFGTSTYFGATGTTLFMYMFPMMVVACLGCVYLHFKNRKS", | |
"PHHIDRETKGGVWSKLRKPMLVKGPLGIVSVTEITFLAMFVALLLWCFIT", "YLRNSFATITPKSAAAHDESLWQAKLESAALRLGLIGNICLAFLFLPVAR", | |
"GSSLLPAMGLTSESSIKYHIWLGHMVMALFTVHGLCYIIYWASMHEISQM", "IMWDTKGVSNLAGEIALAAGLVMWATTYPKIRRRFFEVFFYTHYLYIVFM", | |
"LFFVLHVGISFSFIALPGFYIFLVDRFLRFLQSRENVRLLAARILPSDTM", "ELTFSKNSKLVYSPTSIMFVNIPSISKLQWHPFTITSSSKLEPEKLSIVI", | |
"KKEGKWSTKLHQRLSSSDQIDRLAVSVEGPYGPASADFLRHEALVMVCGG", "SGITPFISVIRDLIATSQKETCKIPKITLICAFKKSSEISMLDLVLPLSG", | |
"LETELSSDINIKIEAFITRDNDAGDEAKAGKIKTLWFKPSLSDQSISSIL", "GPNSWLWLGAILASSFLIFMIIIGIITRYYIYPIDHNTNKIYSLTSKTII", | |
"YILVISVSIMATCSAAMLWNKKKYGKVESKQVQNVDRPSPTSSPTSSWGY", "NSLREIESTPQESLVQRTNLHFGERPNLKKLLLDVEGSSVGVLVCGPKKM", | |
"RQKVAEICSSGLAENLHFESISFSW"] | |
//in GFF / GPFF formatting
// Lay the sequence out GFF / GPFF style: blocks of ten residues, six blocks per
// line, each line prefixed with the 1-based position of its first residue.
// The position is right-aligned in a four-column field, so the pad source must
// be four spaces wide; a single-space source can never pad by more than one
// character and leaves the 1 / 61 / 121 ... column misaligned.
// As before, every line starts with '\n' (the output therefore begins with a
// newline) and the blocks are joined with single spaces.
"MEIEKSNNGGSNPSAGEEFKDMIKGVTKFLMMVIFLGTIMLWIMMPTLTYRTKWLPHLRIKFGTSTYFGATGTTLFMYMFPMMVVACLGCVYLHFKNRKSPHHIDRETKGGVWSKLRKPMLVKGPLGIVSVTEITFLAMFVALLLWCFITYLRNSFATITPKSAAAHDESLWQAKLESAALRLGLIGNICLAFLFLPVARGSSLLPAMGLTSESSIKYHIWLGHMVMALFTVHGLCYIIYWASMHEISQMIMWDTKGVSNLAGEIALAAGLVMWATTYPKIRRRFFEVFFYTHYLYIVFMLFFVLHVGISFSFIALPGFYIFLVDRFLRFLQSRENVRLLAARILPSDTMELTFSKNSKLVYSPTSIMFVNIPSISKLQWHPFTITSSSKLEPEKLSIVIKKEGKWSTKLHQRLSSSDQIDRLAVSVEGPYGPASADFLRHEALVMVCGGSGITPFISVIRDLIATSQKETCKIPKITLICAFKKSSEISMLDLVLPLSGLETELSSDINIKIEAFITRDNDAGDEAKAGKIKTLWFKPSLSDQSISSILGPNSWLWLGAILASSFLIFMIIIGIITRYYIYPIDHNTNKIYSLTSKTIIYILVISVSIMATCSAAMLWNKKKYGKVESKQVQNVDRPSPTSSPTSSWGYNSLREIESTPQESLVQRTNLHFGERPNLKKLLLDVEGSSVGVLVCGPKKMRQKVAEICSSGLAENLHFESISFSW"
    .split(/(.{10})/gm).filter(Boolean).map(function (chunk, i) {
        // Only the first block of each group of six opens a new line.
        if (i % 6) { return chunk; }
        var pos = String(i * 10 + 1);
        // Clamp at 0 so positions wider than the field are not handed a
        // negative slice end (which would count from the end of the pad).
        var pad = '    '.slice(0, Math.max(0, 4 - pos.length));
        return '\n' + pad + pos + ' ' + chunk;
    }).join(' ')
//>result | |
1 MEIEKSNNGG SNPSAGEEFK DMIKGVTKFL MMVIFLGTIM LWIMMPTLTY RTKWLPHLRI | |
61 KFGTSTYFGA TGTTLFMYMF PMMVVACLGC VYLHFKNRKS PHHIDRETKG GVWSKLRKPM | |
121 LVKGPLGIVS VTEITFLAMF VALLLWCFIT YLRNSFATIT PKSAAAHDES LWQAKLESAA | |
181 LRLGLIGNIC LAFLFLPVAR GSSLLPAMGL TSESSIKYHI WLGHMVMALF TVHGLCYIIY | |
241 WASMHEISQM IMWDTKGVSN LAGEIALAAG LVMWATTYPK IRRRFFEVFF YTHYLYIVFM | |
301 LFFVLHVGIS FSFIALPGFY IFLVDRFLRF LQSRENVRLL AARILPSDTM ELTFSKNSKL | |
361 VYSPTSIMFV NIPSISKLQW HPFTITSSSK LEPEKLSIVI KKEGKWSTKL HQRLSSSDQI | |
421 DRLAVSVEGP YGPASADFLR HEALVMVCGG SGITPFISVI RDLIATSQKE TCKIPKITLI | |
481 CAFKKSSEIS MLDLVLPLSG LETELSSDIN IKIEAFITRD NDAGDEAKAG KIKTLWFKPS | |
541 LSDQSISSIL GPNSWLWLGA ILASSFLIFM IIIGIITRYY IYPIDHNTNK IYSLTSKTII | |
601 YILVISVSIM ATCSAAMLWN KKKYGKVESK QVQNVDRPSP TSSPTSSWGY NSLREIESTP | |
661 QESLVQRTNL HFGERPNLKK LLLDVEGSSV GVLVCGPKKM RQKVAEICSS GLAENLHFES | |
721 ISFSW |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//lo sauer, 2012 - free use | |
//analogous solution in C#, using LINQ to filter out empty elements
using System;
using System.Linq;
using System.Text.RegularExpressions;
// Split the sequence into consecutive 50-character chunks.
// Regex.Split includes the text captured by the group in its result, so each
// chunk comes back interleaved with the empty strings found between adjacent
// matches; the Where clause removes those and keeps the shorter remainder at
// the end. Where/ToArray are LINQ extension methods and need System.Linq.
// No RegexOptions are passed: Multiline only changes the meaning of ^ and $,
// which this pattern does not use.
Regex.Split(
        "MEIEKSNNGGSNPSAGEEFKDMIKGVTKFLMMVIFLGTIMLWIMMPTLTYRTKWLPHLRIKFGTSTYFGATGTTLFMYMFPMMVVACLGCVYLHFKNRKSPHHIDRETKGGVWSKLRKPMLVKGPLGIVSVTEITFLAMFVALLLWCFITYLRNSFATITPKSAAAHDESLWQAKLESAALRLGLIGNICLAFLFLPVARGSSLLPAMGLTSESSIKYHIWLGHMVMALFTVHGLCYIIYWASMHEISQMIMWDTKGVSNLAGEIALAAGLVMWATTYPKIRRRFFEVFFYTHYLYIVFMLFFVLHVGISFSFIALPGFYIFLVDRFLRFLQSRENVRLLAARILPSDTMELTFSKNSKLVYSPTSIMFVNIPSISKLQWHPFTITSSSKLEPEKLSIVIKKEGKWSTKLHQRLSSSDQIDRLAVSVEGPYGPASADFLRHEALVMVCGGSGITPFISVIRDLIATSQKETCKIPKITLICAFKKSSEISMLDLVLPLSGLETELSSDINIKIEAFITRDNDAGDEAKAGKIKTLWFKPSLSDQSISSILGPNSWLWLGAILASSFLIFMIIIGIITRYYIYPIDHNTNKIYSLTSKTIIYILVISVSIMATCSAAMLWNKKKYGKVESKQVQNVDRPSPTSSPTSSWGYNSLREIESTPQESLVQRTNLHFGERPNLKKLLLDVEGSSVGVLVCGPKKMRQKVAEICSSGLAENLHFESISFSW",
        "(.{50})")
    .Where(s => !string.IsNullOrEmpty(s))
    .ToArray();
//> Result (courtesy of the great REPL Mono CS Shell2: http://www.mono-project.com/CsharpRepl ) | |
{ "MEIEKSNNGGSNPSAGEEFKDMIKGVTKFLMMVIFLGTIMLWIMMPTLTY", "RTKWLPHLRIKFGTSTYFGATGTTLFMYMFPMMVVACLGCVYLHFKNRKS", | |
"PHHIDRETKGGVWSKLRKPMLVKGPLGIVSVTEITFLAMFVALLLWCFIT", "YLRNSFATITPKSAAAHDESLWQAKLESAALRLGLIGNICLAFLFLPVAR", | |
"GSSLLPAMGLTSESSIKYHIWLGHMVMALFTVHGLCYIIYWASMHEISQM", "IMWDTKGVSNLAGEIALAAGLVMWATTYPKIRRRFFEVFFYTHYLYIVFM", | |
"LFFVLHVGISFSFIALPGFYIFLVDRFLRFLQSRENVRLLAARILPSDTM", "ELTFSKNSKLVYSPTSIMFVNIPSISKLQWHPFTITSSSKLEPEKLSIVI", | |
"KKEGKWSTKLHQRLSSSDQIDRLAVSVEGPYGPASADFLRHEALVMVCGG", "SGITPFISVIRDLIATSQKETCKIPKITLICAFKKSSEISMLDLVLPLSG", | |
"LETELSSDINIKIEAFITRDNDAGDEAKAGKIKTLWFKPSLSDQSISSIL", "GPNSWLWLGAILASSFLIFMIIIGIITRYYIYPIDHNTNKIYSLTSKTII", | |
"YILVISVSIMATCSAAMLWNKKKYGKVESKQVQNVDRPSPTSSPTSSWGY", "NSLREIESTPQESLVQRTNLHFGERPNLKKLLLDVEGSSVGVLVCGPKKM", | |
"RQKVAEICSSGLAENLHFESISFSW" | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment