Skip to content

Instantly share code, notes, and snippets.

@lindenb
Last active June 15, 2016 19:18
Show Gist options
  • Save lindenb/6090786 to your computer and use it in GitHub Desktop.
Save lindenb/6090786 to your computer and use it in GitHub Desktop.
Comparing the per-sequence MD5 sums of the 1000 Genomes human_g1k_v37 reference vs. UCSC hg19
$ curl -s "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz" | gunzip -c | java FastaMD5 > a.txt
$ curl -s "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz" | gunzip -c | tar Oxvf - 2> /dev/null | java FastaMD5 > b.txt
## join: sequences whose MD5 sum is present in both files
$ join -t ' ' -1 2 -2 2 <(sort -t ' ' -k2,2 a.txt ) <(sort -t ' ' -k2,2 b.txt ) | cut -d ' ' -f 1,2,4 | sort -t ' ' -k3,3
## unjoinable: sequences whose MD5 sum is present in only one of the two files
$ join -t ' ' -1 2 -2 2 -v 1 -v 2 <(sort -t ' ' -k2,2 a.txt ) <(sort -t ' ' -k2,2 b.txt ) | sort -t ' ' -k2,2
import java.io.*;
import java.security.MessageDigest;
/**
 * Reads FASTA records from standard input and prints one line per record to
 * standard output:
 *
 * <pre>header TAB md5-hex TAB length</pre>
 *
 * The header is the text following {@code '>'} up to the end of that line.
 * The MD5 digest and the length are computed over the record's residues after
 * upper-casing them and skipping spaces, carriage returns and line feeds, so
 * the result does not depend on case or on how the sequence is wrapped.
 */
public class FastaMD5
{
    /** Lower-case hexadecimal digits, indexed by nibble value. */
    private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray();

    /**
     * Streams standard input once, emitting a line for every FASTA record.
     *
     * @param args ignored
     * @throws IOException if reading fails, or if residue bytes are found
     *         before the first {@code '>'} header
     * @throws Exception if the MD5 algorithm is not available
     */
    public static void main(String args[]) throws Exception
    {
        InputStream in = new BufferedInputStream(System.in);
        PrintStream out = System.out;
        // Digest of the record currently being read; null until a header is seen.
        MessageDigest md5 = null;
        // long, not int: a single sequence may exceed Integer.MAX_VALUE residues.
        long length = 0L;
        for (;;)
        {
            int c = in.read();
            if (c == -1 || c == '>')
            {
                // End of input or start of the next record: close the open one.
                if (md5 != null)
                {
                    out.println(toHex(md5.digest()) + "\t" + length);
                    md5 = null;
                }
                if (c == -1)
                {
                    out.flush();
                    return;
                }
                out.print(readHeader(in));
                out.print('\t');
                md5 = MessageDigest.getInstance("MD5");
                length = 0L;
            }
            else if (c == '\n' || c == ' ' || c == '\r')
            {
                // Layout characters are not part of the sequence.
                continue;
            }
            else
            {
                if (md5 == null)
                {
                    // Fail with a clear message instead of a NullPointerException.
                    throw new IOException(
                        "Malformed FASTA: sequence data found before the first '>' header");
                }
                md5.update((byte) Character.toUpperCase(c));
                ++length;
            }
        }
    }

    /**
     * Reads the remainder of a header line (the {@code '>'} has already been
     * consumed). Stops at a line feed or at end of input; the line feed is
     * consumed but not returned, and a trailing carriage return from a CRLF
     * line ending is dropped.
     */
    private static String readHeader(InputStream in) throws IOException
    {
        StringBuilder header = new StringBuilder();
        int c;
        while ((c = in.read()) != -1 && c != '\n')
        {
            header.append((char) c);
        }
        int last = header.length() - 1;
        if (last >= 0 && header.charAt(last) == '\r')
        {
            header.setLength(last);
        }
        return header.toString();
    }

    /** Formats bytes as lower-case hexadecimal, two digits per byte. */
    private static String toHex(byte[] bytes)
    {
        StringBuilder hex = new StringBuilder(bytes.length * 2);
        for (byte b : bytes)
        {
            hex.append(HEX_DIGITS[(b >> 4) & 0x0f]).append(HEX_DIGITS[b & 0x0f]);
        }
        return hex.toString();
    }
}
1b22b98cdeb4a9304cb5d48026a85128 1 dna:chromosome chromosome:GRCh37:1:1:249250621:1 chr1
988c28e000e84c26d552359af1ea2e1d 10 dna:chromosome chromosome:GRCh37:10:1:135534747:1 chr10
98c59049a2df285c76ffb1c6db8f8b96 11 dna:chromosome chromosome:GRCh37:11:1:135006516:1 chr11
06cbf126247d89664a4faebad130fe9c GL000202.1 dna:supercontig supercontig::GL000202.1:1:40103:1 chr11_gl000202_random
51851ac0e1a115847ad36449b0015864 12 dna:chromosome chromosome:GRCh37:12:1:133851895:1 chr12
283f8d7892baa81b510a015719ca7b0b 13 dna:chromosome chromosome:GRCh37:13:1:115169878:1 chr13
98f3cae32b2a2e9524bc19813927542e 14 dna:chromosome chromosome:GRCh37:14:1:107349540:1 chr14
e5645a794a8238215b2cd77acb95a078 15 dna:chromosome chromosome:GRCh37:15:1:102531392:1 chr15
fc9b1a7b42b97a864f56b348b06095e6 16 dna:chromosome chromosome:GRCh37:16:1:90354753:1 chr16
351f64d4f4f9ddd45b35336ad97aa6de 17 dna:chromosome chromosome:GRCh37:17:1:81195210:1 chr17
96358c325fe0e70bee73436e8bb14dbd GL000203.1 dna:supercontig supercontig::GL000203.1:1:37498:1 chr17_gl000203_random
efc49c871536fa8d79cb0a06fa739722 GL000204.1 dna:supercontig supercontig::GL000204.1:1:81310:1 chr17_gl000204_random
d22441398d99caf673e9afb9a1908ec5 GL000205.1 dna:supercontig supercontig::GL000205.1:1:174588:1 chr17_gl000205_random
43f69e423533e948bfae5ce1d45bd3f1 GL000206.1 dna:supercontig supercontig::GL000206.1:1:41001:1 chr17_gl000206_random
b15d4b2d29dde9d3e4f93d1d0f2cbc9c 18 dna:chromosome chromosome:GRCh37:18:1:78077248:1 chr18
f3814841f1939d3ca19072d9e89f3fd7 GL000207.1 dna:supercontig supercontig::GL000207.1:1:4262:1 chr18_gl000207_random
1aacd71f30db8e561810913e0b72636d 19 dna:chromosome chromosome:GRCh37:19:1:59128983:1 chr19
aa81be49bf3fe63a79bdc6a6f279abf6 GL000208.1 dna:supercontig supercontig::GL000208.1:1:92689:1 chr19_gl000208_random
f40598e2a5a6b26e84a3775e0d1e2c81 GL000209.1 dna:supercontig supercontig::GL000209.1:1:159169:1 chr19_gl000209_random
d75b436f50a8214ee9c2a51d30b2c2cc GL000191.1 dna:supercontig supercontig::GL000191.1:1:106433:1 chr1_gl000191_random
325ba9e808f669dfeee210fdd7b470ac GL000192.1 dna:supercontig supercontig::GL000192.1:1:547496:1 chr1_gl000192_random
a0d9851da00400dec1098a9255ac712e 2 dna:chromosome chromosome:GRCh37:2:1:243199373:1 chr2
0dec9660ec1efaaf33281c0d5ea2560f 20 dna:chromosome chromosome:GRCh37:20:1:63025520:1 chr20
2979a6085bfe28e3ad6f552f361ed74d 21 dna:chromosome chromosome:GRCh37:21:1:48129895:1 chr21
851106a74238044126131ce2a8e5847c GL000210.1 dna:supercontig supercontig::GL000210.1:1:27682:1 chr21_gl000210_random
a718acaa6135fdca8357d5bfe94211dd 22 dna:chromosome chromosome:GRCh37:22:1:51304566:1 chr22
23dccd106897542ad87d2765d28a19a1 4 dna:chromosome chromosome:GRCh37:4:1:191154276:1 chr4
dbb6e8ece0b5de29da56601613007c2a GL000193.1 dna:supercontig supercontig::GL000193.1:1:189789:1 chr4_gl000193_random
6ac8f815bf8e845bb3031b73f812c012 GL000194.1 dna:supercontig supercontig::GL000194.1:1:191469:1 chr4_gl000194_random
0740173db9ffd264d728f32784845cd7 5 dna:chromosome chromosome:GRCh37:5:1:180915260:1 chr5
1d3a93a248d92a729ee764823acbbc6b 6 dna:chromosome chromosome:GRCh37:6:1:171115067:1 chr6
618366e953d6aaad97dbe4777c29375e 7 dna:chromosome chromosome:GRCh37:7:1:159138663:1 chr7
5d9ec007868d517e73543b005ba48535 GL000195.1 dna:supercontig supercontig::GL000195.1:1:182896:1 chr7_gl000195_random
96f514a9929e410c6651697bded59aec 8 dna:chromosome chromosome:GRCh37:8:1:146364022:1 chr8
d92206d1bb4c3b4019c43c0875c06dc0 GL000196.1 dna:supercontig supercontig::GL000196.1:1:38914:1 chr8_gl000196_random
6f5efdd36643a9b8c8ccad6f2f1edc7b GL000197.1 dna:supercontig supercontig::GL000197.1:1:37175:1 chr8_gl000197_random
3e273117f15e0a400f01055d9f393768 9 dna:chromosome chromosome:GRCh37:9:1:141213431:1 chr9
868e7784040da90d900d2d1b667a1383 GL000198.1 dna:supercontig supercontig::GL000198.1:1:90085:1 chr9_gl000198_random
569af3b73522fab4b40995ae4944e78e GL000199.1 dna:supercontig supercontig::GL000199.1:1:169874:1 chr9_gl000199_random
75e4c8d17cd4addf3917d1703cacaf25 GL000200.1 dna:supercontig supercontig::GL000200.1:1:187035:1 chr9_gl000200_random
dfb7e7ec60ffdcb85cb359ea28454ee9 GL000201.1 dna:supercontig supercontig::GL000201.1:1:36148:1 chr9_gl000201_random
7daaa45c66b288847b9b32b964e623d3 GL000211.1 dna:supercontig supercontig::GL000211.1:1:166566:1 chrUn_gl000211
563531689f3dbd691331fd6c5730a88b GL000212.1 dna:supercontig supercontig::GL000212.1:1:186858:1 chrUn_gl000212
9d424fdcc98866650b58f004080a992a GL000213.1 dna:supercontig supercontig::GL000213.1:1:164239:1 chrUn_gl000213
46c2032c37f2ed899eb41c0473319a69 GL000214.1 dna:supercontig supercontig::GL000214.1:1:137718:1 chrUn_gl000214
5eb3b418480ae67a997957c909375a73 GL000215.1 dna:supercontig supercontig::GL000215.1:1:172545:1 chrUn_gl000215
642a232d91c486ac339263820aef7fe0 GL000216.1 dna:supercontig supercontig::GL000216.1:1:172294:1 chrUn_gl000216
6d243e18dea1945fb7f2517615b8f52e GL000217.1 dna:supercontig supercontig::GL000217.1:1:172149:1 chrUn_gl000217
1d708b54644c26c7e01c2dad5426d38c GL000218.1 dna:supercontig supercontig::GL000218.1:1:161147:1 chrUn_gl000218
f977edd13bac459cb2ed4a5457dba1b3 GL000219.1 dna:supercontig supercontig::GL000219.1:1:179198:1 chrUn_gl000219
fc35de963c57bf7648429e6454f1c9db GL000220.1 dna:supercontig supercontig::GL000220.1:1:161802:1 chrUn_gl000220
3238fb74ea87ae857f9c7508d315babb GL000221.1 dna:supercontig supercontig::GL000221.1:1:155397:1 chrUn_gl000221
6fe9abac455169f50470f5a6b01d0f59 GL000222.1 dna:supercontig supercontig::GL000222.1:1:186861:1 chrUn_gl000222
399dfa03bf32022ab52a846f7ca35b30 GL000223.1 dna:supercontig supercontig::GL000223.1:1:180455:1 chrUn_gl000223
d5b2fc04f6b41b212a4198a07f450e20 GL000224.1 dna:supercontig supercontig::GL000224.1:1:179693:1 chrUn_gl000224
63945c3e6962f28ffd469719a747e73c GL000225.1 dna:supercontig supercontig::GL000225.1:1:211173:1 chrUn_gl000225
1c1b2cd1fccbc0a99b6a447fa24d1504 GL000226.1 dna:supercontig supercontig::GL000226.1:1:15008:1 chrUn_gl000226
a4aead23f8053f2655e468bcc6ecdceb GL000227.1 dna:supercontig supercontig::GL000227.1:1:128374:1 chrUn_gl000227
c5a17c97e2c1a0b6a9cc5a6b064b714f GL000228.1 dna:supercontig supercontig::GL000228.1:1:129120:1 chrUn_gl000228
d0f40ec87de311d8e715b52e4c7062e1 GL000229.1 dna:supercontig supercontig::GL000229.1:1:19913:1 chrUn_gl000229
b4eb71ee878d3706246b7c1dbef69299 GL000230.1 dna:supercontig supercontig::GL000230.1:1:43691:1 chrUn_gl000230
ba8882ce3a1efa2080e5d29b956568a4 GL000231.1 dna:supercontig supercontig::GL000231.1:1:27386:1 chrUn_gl000231
3e06b6741061ad93a8587531307057d8 GL000232.1 dna:supercontig supercontig::GL000232.1:1:40652:1 chrUn_gl000232
7fed60298a8d62ff808b74b6ce820001 GL000233.1 dna:supercontig supercontig::GL000233.1:1:45941:1 chrUn_gl000233
93f998536b61a56fd0ff47322a911d4b GL000234.1 dna:supercontig supercontig::GL000234.1:1:40531:1 chrUn_gl000234
118a25ca210cfbcdfb6c2ebb249f9680 GL000235.1 dna:supercontig supercontig::GL000235.1:1:34474:1 chrUn_gl000235
fdcd739913efa1fdc64b6c0cd7016779 GL000236.1 dna:supercontig supercontig::GL000236.1:1:41934:1 chrUn_gl000236
e0c82e7751df73f4f6d0ed30cdc853c0 GL000237.1 dna:supercontig supercontig::GL000237.1:1:45867:1 chrUn_gl000237
131b1efc3270cc838686b54e7c34b17b GL000238.1 dna:supercontig supercontig::GL000238.1:1:39939:1 chrUn_gl000238
99795f15702caec4fa1c4e15f8a29c07 GL000239.1 dna:supercontig supercontig::GL000239.1:1:33824:1 chrUn_gl000239
445a86173da9f237d7bcf41c6cb8cc62 GL000240.1 dna:supercontig supercontig::GL000240.1:1:41933:1 chrUn_gl000240
ef4258cdc5a45c206cea8fc3e1d858cf GL000241.1 dna:supercontig supercontig::GL000241.1:1:42152:1 chrUn_gl000241
2f8694fc47576bc81b5fe9e7de0ba49e GL000242.1 dna:supercontig supercontig::GL000242.1:1:43523:1 chrUn_gl000242
cc34279a7e353136741c9fce79bc4396 GL000243.1 dna:supercontig supercontig::GL000243.1:1:43341:1 chrUn_gl000243
0996b4475f353ca98bacb756ac479140 GL000244.1 dna:supercontig supercontig::GL000244.1:1:39929:1 chrUn_gl000244
89bc61960f37d94abf0df2d481ada0ec GL000245.1 dna:supercontig supercontig::GL000245.1:1:36651:1 chrUn_gl000245
e4afcd31912af9d9c2546acf1cb23af2 GL000246.1 dna:supercontig supercontig::GL000246.1:1:38154:1 chrUn_gl000246
7de00226bb7df1c57276ca6baabafd15 GL000247.1 dna:supercontig supercontig::GL000247.1:1:36422:1 chrUn_gl000247
5a8e43bec9be36c7b49c84d585107776 GL000248.1 dna:supercontig supercontig::GL000248.1:1:39786:1 chrUn_gl000248
1d78abec37c15fe29a275eb08d5af236 GL000249.1 dna:supercontig supercontig::GL000249.1:1:38502:1 chrUn_gl000249
7e0e2e580297b7764e31dbc80c2540dd X dna:chromosome chromosome:GRCh37:X:1:155270560:1 chrX
d89517b400226d3b56e753972a7cad67 chr17_ctg5_hap1 1680828
641e4338fa8d52a5b781bd2a2c08d3c3 chr3 198022430
fa24f81b680df26bcfb6d69b784fbe36 chr4_ctg9_hap1 590426
fe71bc63420d666884f37a3ad79f3317 chr6_apd_hap1 4622290
18c17e1641ef04873b15f40f6c8659a4 chr6_cox_hap2 4795371
2a3c677c426a10e137883ae1ffb8da3f chr6_dbb_hap3 4610396
9d51d4152174461cd6715c7ddc588dc8 chr6_mann_hap4 4683263
efed415dd8742349cb7aaca054675b9a chr6_mcf_hap5 4833398
094d037050cad692b57ea12c4fef790f chr6_qbl_hap6 4611984
3b6d666200e72bcc036bf88a4d7e0749 chr6_ssto_hap7 4928567
d2ed829b8a1628d16cbeee88e88e39eb chrM 16571
1e86411d73e6f00a10590f976be01623 chrY 59373566
fdfd811849cc2fadebc929bb925902e5 3 dna:chromosome chromosome:GRCh37:3:1:198022430:1 198022430
c68f52674c9fb33aef52dcf399755519 MT gi|251831106|ref|NC_012920.1| Homo sapiens mitochondrion, complete genome 16569
1fa3474750af0948bdf97d5a0ee52e51 Y dna:chromosome chromosome:GRCh37:Y:2649521:59034049:1 59373566
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment