// sudo yum update
// sudo yum install -y git
// sudo wget http://repos.fedorapeople.org/repos/dchen/apache-maven/epel-apache-maven.repo -O /etc/yum.repos.d/epel-apache-maven.repo
// sudo sed -i s/\$releasever/6/g /etc/yum.repos.d/epel-apache-maven.repo
// sudo yum install -y apache-maven
// git clone https://github.com/bigdatagenomics/adam.git
// cd adam
// git checkout adam-parent-0.15.0
// export "MAVEN_OPS=-Xmx512m -XX:MaxPermSize=128m"
// # long
// mvn clean package -DskipTests
// export ADAM_HOME=`pwd`
// alias adam-submit="$ADAM_HOME/bin/adam-submit"
// export SPARK_HOME=/usr/lib/spark
// cat >> ~/.bashrc
// export "MAVEN_OPS=-Xmx512m -XX:MaxPermSize=128m"
// export ADAM_HOME=/home/hadoop/adam
// export SPARK_HOME=/usr/lib/spark
// alias adam-submit="$ADAM_HOME/bin/adam-submit"
// ^C
///////////////////////////////
//
// $ adam-submit
// SPARK_HOME must be set for 'adam-submit'
//
// Looking inside adam-submit...
// # Find spark-submit script
// if [ -z "$SPARK_HOME" ]; then
//   echo "SPARK_HOME must be set for 'adam-submit'"
//   exit 1
// else
//   SPARK_SUBMIT="$SPARK_HOME"/bin/spark-submit
// fi
// [root@ip-172-31-28-194 ~]# find / -name "*spark*"
// # => たぶん /usr/lib/spark/
// /home/hadoop/adam/bin/adam-submit: 行 64: /usr/lib/spark/bin/utils.sh: そのようなファイルやディレクトリはありません
// http://se.bunri-u.ac.jp/~yamamoto/hadoop5/SetupCDH5/spark/index.html
// spark-shell
// /usr/lib/spark/bin/spark-shell: line 44: /usr/lib/spark/bin/utils.sh: No such file or directory
// Since this error appears, take utils.sh from https://github.com/apache/spark/blob/master/bin/utils.sh and install it as /usr/lib/spark/bin/utils.sh.
// # https://github.com/apache/spark/tree/master/bin => not there
// # https://github.com/apache/spark/tree/v1.2.0/bin => there
// curl -O https://raw.githubusercontent.com/apache/spark/v1.2.0/bin/utils.sh
// chmod +x utils.sh
// sudo mv utils.sh /usr/lib/spark/bin/
// # Apparently this utils.sh assumes Spark 1.2, but it works anyway
// Adding the key to authorized_keys also makes `aws emr ssh` work
import org.apache.spark.rdd.RDD
import org.bdgenomics.adam.rdd.ADAMContext._
import org.bdgenomics.formats.avro.AlignmentRecord
val readsRDD: RDD[AlignmentRecord] = sc.adamLoad("/user/ds/genomics/reads/HG00103.adam")
readsRDD.first()
// res0: org.bdgenomics.formats.avro.AlignmentRecord = {"contig": {"contigName": "1",
// "contigLength": 249250621,
// "contigMD5": "1b22b98cdeb4a9304cb5d48026a85128",
// "referenceURL": "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human",
// "assembly": null,
// "species": null},
// "start": 9992,
// "oldPosition": null,
// "end": 10091,
// "mapq": 25,
// "readName": "SRR062643.12466352",
// "sequence": "CTCTTCCGATCTCCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCT",
// "qual": "##@@BA:36<FBGCBBD>AHHB@4DD@B;0DEF6A9EDC6>9CCC@9@IIH@I8IIC4@GH=HGHCIHHHGAGABEGAGG@EGAFHGFFEEE?DEFDDA.",
// "cigar": "1S99M",
// "oldCigar": null,
// "basesTrimmedFromStart": 0,
// "basesTrimmedFromEnd": 0,
// "readPaired": true,
// "properP...
readsRDD.count()
// res1: Long = 160397565
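// Optional sketch: readsRDD is scanned several more times below (distinct and
// the CFTR filters), so persisting it can avoid re-reading ~12.7 GB of Parquet
// from HDFS on each action. Assumes the executors have room to cache or spill.
import org.apache.spark.storage.StorageLevel
readsRDD.persist(StorageLevel.MEMORY_AND_DISK)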
val uniq_chr = (readsRDD.map(_.contig.contigName.toString).distinct().collect())
// scala> uniq_chr
// res2: Array[String] = Array(GL000192.1, GL000194.1, GL000196.1, GL000231.1, GL000198.1, GL000210.1, GL000233.1, GL000212.1, GL000235.1, GL000214.1, GL000237.1, GL000216.1, GL000239.1, GL000218.1, 10, 11, 12, 13, 14, 15, 16, GL000240.1, 17, 18, MT, 19, GL000242.1, GL000221.1, GL000200.1, GL000244.1, GL000223.1, GL000246.1, GL000202.1, GL000225.1, GL000204.1, GL000248.1, GL000227.1, GL000206.1, 1, GL000229.1, 2, GL000208.1, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, hs37d5, GL000191.1, GL000193.1, GL000195.1, GL000230.1, GL000197.1, GL000199.1, GL000232.1, GL000211.1, GL000234.1, GL000213.1, GL000236.1, GL000215.1, GL000238.1, GL000217.1, GL000219.1, X, Y, GL000241.1, GL000220.1, GL000243.1, GL000222.1, GL000245.1, GL000201.1, GL000224.1, GL000247.1, GL000203.1, NC_007605, GL000226.1, GL000205.1, G...
// "すべてヒトの染色体に由来するもの" であること
uniq_chr.sorted.foreach(println)
// 1
// 10
// 11
// 12
// 13
// 14
// 15
// 16
// 17
// 18
// 19
// 2
// 20
// 21
// 22
// 3
// 4
// 5
// 6
// 7
// 8
// 9
// GL000191.1
// ...
// GL000248.1
// GL000249.1
// MT
// NC_007605
// X
// Y
// hs37d5
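// A minimal sketch of that check; the whitelist pattern is an assumption
// distilled from the names printed above (autosomes 1-22, X/Y/MT, GL*
// unplaced scaffolds, the NC_007605 EBV sequence, and the hs37d5 decoy).
val humanRef = "^([1-9]|1[0-9]|2[0-2]|X|Y|MT|GL\\d+\\.\\d+|NC_007605|hs37d5)$".r
val unexpected = uniq_chr.filterNot(c => humanRef.pattern.matcher(c).matches)
require(unexpected.isEmpty, s"non-human contigs: ${unexpected.mkString(", ")}")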
val cftr_reads = (readsRDD
  .filter(_.contig.contigName.toString == "7")
  .filter(_.start <= 117149189)
  .filter(_.end > 117149189)
  .collect())
cftr_reads.length // res6: Int = 9
// scala> cftr_reads
// res5: Array[org.bdgenomics.formats.avro.AlignmentRecord] = Array({"contig": {"contigName": "7", "contigLength": 159138663, "contigMD5": "618366e953d6aaad97dbe4777c29375e", "referenceURL": "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz AS:NCBI37 SP:Human", "assembly": null, "species": null}, "start": 117149104, "oldPosition": null, "end": 117149204, "mapq": 60, "readName": "SRR062642.24026612", "sequence": "TGGCTTCAAAGAAAAATCCTAAACTCATTAATGCCCTTCGGCGATGTTTTTTCTGGAGATTTATGTTCTATGGAATCTTTTTATATTTAGGGGTAAGGAT", "qual": "/LNNPNOPPPPPQQQQPQQLPQQPRQRPQPRQRRRRSQQHRHILLI?MLLHH?D7ICECFMMEEDKN@OCIBJIOIIKQQJJ?C@JIJD?<EEDCED?B>", "cigar": "100M", "oldCigar": null, "basesTrimmedFromStart": 0, "basesTrimmedFromEnd": 0, "readPa...
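// A sketch: the three filters above can be fused into a single pass with one
// closure; the semantics are unchanged.
val cftrPos = 117149189L
val cftr_reads_fused = readsRDD.filter { r =>
  r.contig.contigName.toString == "7" && r.start <= cftrPos && r.end > cftrPos
}.collect()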
import org.bdgenomics.adam.predicates.ColumnReaderInput._
import org.bdgenomics.adam.predicates.ADAMPredicate
import org.bdgenomics.adam.predicates.RecordCondition
import org.bdgenomics.adam.predicates.FieldCondition
class CftrLocusPredicate extends ADAMPredicate[AlignmentRecord] {
  override val recordCondition = RecordCondition[AlignmentRecord](
    FieldCondition(
      // contig names in this dataset carry no "chr" prefix (see uniq_chr
      // above), so match "7" rather than "chr7"
      "contig.contigName", (x: String) => x == "7"),
    FieldCondition(
      "start", (x: Long) => x <= 117149189),
    FieldCondition(
      "end", (x: Long) => x >= 117149189)
  )
}
val cftr_reads2 = sc.adamLoad[AlignmentRecord, CftrLocusPredicate](
  "/user/ds/genomics/reads/HG00103.adam",
  Some(classOf[CftrLocusPredicate])).collect()
// ERROR.............
// org.apache.hadoop.mapreduce.lib.input.InvalidInputException: Input path does not exist: hdfs://ip-172-31-28-194.ap-northeast-1.compute.internal:8020/user/ds/genomics/reads/HG00103
// -------------
// from README.md
```bash
hadoop fs -mkdir /user/ds/genomics/dnase
curl -s -L "https://www.encodeproject.org/files/ENCFF001UVC/@@download/ENCFF001UVC.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/dnase/GM12878.DNase.narrowPeak
curl -s -L "https://www.encodeproject.org/files/ENCFF001UWQ/@@download/ENCFF001UWQ.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/dnase/K562.DNase.narrowPeak
curl -s -L "https://www.encodeproject.org/files/ENCFF001WEI/@@download/ENCFF001WEI.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/dnase/BJ.DNase.narrowPeak
curl -s -L "https://www.encodeproject.org/files/ENCFF001UVQ/@@download/ENCFF001UVQ.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/dnase/HEK293.DNase.narrowPeak
curl -s -L "https://www.encodeproject.org/files/ENCFF001SOM/@@download/ENCFF001SOM.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/dnase/H54.DNase.narrowPeak
curl -s -L "https://www.encodeproject.org/files/ENCFF001UVU/@@download/ENCFF001UVU.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/dnase/HepG2.DNase.narrowPeak
```
GENCODE data:
```bash
curl -s -L "ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_18/gencode.v18.annotation.gtf.gz" | gunzip | hadoop fs -put - /user/ds/genomics/gencode.v18.annotation.gtf
```
ChIP-seq data for CTCF:
```bash
hadoop fs -mkdir /user/ds/genomics/chip-seq
curl -s -L "https://www.encodeproject.org/files/ENCFF001VED/@@download/ENCFF001VED.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/chip-seq/GM12878.ChIP-seq.CTCF.narrowPeak
curl -s -L "https://www.encodeproject.org/files/ENCFF001VMZ/@@download/ENCFF001VMZ.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/chip-seq/K562.ChIP-seq.CTCF.narrowPeak
curl -s -L "https://www.encodeproject.org/files/ENCFF001XMU/@@download/ENCFF001XMU.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/chip-seq/BJ.ChIP-seq.CTCF.narrowPeak
curl -s -L "https://www.encodeproject.org/files/ENCFF001XQU/@@download/ENCFF001XQU.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/chip-seq/HEK293.ChIP-seq.CTCF.narrowPeak
curl -s -L "https://www.encodeproject.org/files/ENCFF001USC/@@download/ENCFF001USC.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/chip-seq/H54.ChIP-seq.CTCF.narrowPeak
curl -s -L "https://www.encodeproject.org/files/ENCFF001XRC/@@download/ENCFF001XRC.bed.gz" | gunzip | hadoop fs -put - /user/ds/genomics/chip-seq/HepG2.ChIP-seq.CTCF.narrowPeak
```
[hadoop@ip-172-31-28-194 dnase]$ hadoop fs -du -h /user/ds/genomics/
15.9 G /user/ds/genomics/HG00103.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam
20.8 M /user/ds/genomics/chip-seq
41.1 M /user/ds/genomics/dnase
1.0 G /user/ds/genomics/gencode.v18.annotation.gtf
12.7 G /user/ds/genomics/reads
[hadoop@ip-172-31-28-194 ~]$ hadoop fs -du -h /user/ds/genomics/dnase/
7.5 M /user/ds/genomics/dnase/BJ.DNase.narrowPeak
7.0 M /user/ds/genomics/dnase/GM12878.DNase.narrowPeak
7.4 M /user/ds/genomics/dnase/H54.DNase.narrowPeak
6.4 M /user/ds/genomics/dnase/HEK293.DNase.narrowPeak
6.5 M /user/ds/genomics/dnase/HepG2.DNase.narrowPeak
6.3 M /user/ds/genomics/dnase/K562.DNase.narrowPeak
[hadoop@ip-172-31-28-194 dnase]$ hadoop fs -du -h /user/ds/genomics/chip-seq
2.6 M /user/ds/genomics/chip-seq/BJ.ChIP-seq.CTCF.narrowPeak
4.7 M /user/ds/genomics/chip-seq/GM12878.ChIP-seq.CTCF.narrowPeak
3.6 M /user/ds/genomics/chip-seq/H54.ChIP-seq.CTCF.narrowPeak
2.1 M /user/ds/genomics/chip-seq/HEK293.ChIP-seq.CTCF.narrowPeak
2.6 M /user/ds/genomics/chip-seq/HepG2.ChIP-seq.CTCF.narrowPeak
5.1 M /user/ds/genomics/chip-seq/K562.ChIP-seq.CTCF.narrowPeak
phyloP data:
```bash
hadoop fs -mkdir /user/ds/genomics/phylop_text
for i in $(seq 1 22); do
  echo "chr$i.phyloP46way.wigFix.gz"
  curl -s -L "http://hgdownload-test.cse.ucsc.edu/goldenPath/hg19/phyloP46way/vertebrate/chr$i.phyloP46way.wigFix.gz" | gunzip | adam-submit wigfix2bed | hadoop fs -put - "/user/ds/genomics/phylop_text/chr$i.phyloP46way.wigFix"
done
curl -s -L "http://hgdownload-test.cse.ucsc.edu/goldenPath/hg19/phyloP46way/vertebrate/chrX.phyloP46way.wigFix.gz" | gunzip | adam-submit wigfix2bed | hadoop fs -put - /user/ds/genomics/phylop_text/chrX.phyloP46way.wigFix
curl -s -L "http://hgdownload-test.cse.ucsc.edu/goldenPath/hg19/phyloP46way/vertebrate/chrY.phyloP46way.wigFix.gz" | gunzip | adam-submit wigfix2bed | hadoop fs -put - /user/ds/genomics/phylop_text/chrY.phyloP46way.wigFix
```
[hadoop@ip-172-31-28-194 dnase]$ hadoop fs -du -h /user/ds/genomics/phylop_text
2.6 K /user/ds/genomics/phylop_text/chr1.phyloP46way.wigFix
2.6 K /user/ds/genomics/phylop_text/chr10.phyloP46way.wigFix
2.6 K /user/ds/genomics/phylop_text/chr11.phyloP46way.wigFix
2.6 K /user/ds/genomics/phylop_text/chr12.phyloP46way.wigFix
2.6 K /user/ds/genomics/phylop_text/chr13.phyloP46way.wigFix
2.6 K /user/ds/genomics/phylop_text/chr14.phyloP46way.wigFix
2.6 K /user/ds/genomics/phylop_text/chr15.phyloP46way.wigFix
2.6 K /user/ds/genomics/phylop_text/chr16.phyloP46way.wigFix
2.6 K /user/ds/genomics/phylop_text/chr17.phyloP46way.wigFix
2.6 K /user/ds/genomics/phylop_text/chr18.phyloP46way.wigFix
...
adamBEDFeatureLoad cannot be found
import org.bdgenomics.adam.rdd.ADAMContext
val ac = new ADAMContext(sc)
ac.adamLoad("/user/hadoop/genomics/phylop_text")
// not sure how to save this
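// One possible way to write records back out, assuming this ADAM release
// exposes adamParquetSave on Avro-record RDDs through the ADAMContext._
// implicits (method name unverified for 0.15.0; some releases call it
// adamSave). The output path below is hypothetical.
import org.bdgenomics.adam.rdd.ADAMContext._
readsRDD.adamParquetSave("/user/ds/genomics/reads/HG00103.resaved.adam")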
scala> val bHg19Data = sc.broadcast(
| new TwoBitFile(
| new LocalFileByteAccess(
| new File("/user/ds/genomics/hg19.2bit"))))
<console>:51: error: not found: type TwoBitFile
new TwoBitFile(
^
scala> import org.bdgenomics.adam.util.{TwoBitFile, SequenceUtils}
<console>:48: error: object TwoBitFile is not a member of package org.bdgenomics.adam.util
import org.bdgenomics.adam.util.{TwoBitFile, SequenceUtils}
^
// -------------------------------
// 1000