Created
January 12, 2024 11:07
-
-
Save olegs/04022f5c35ef31af307087b15264e8ac to your computer and use it in GitHub Desktop.
Rapid unleashing of macrophage efferocytic capacity via transcriptional pause/release (commands.sh)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ATAC-seq & pro-seq & rna-seq analysis | |
### Analysis of all ATAC-seq datasets | |
``` | |
# chipseq-smk-pipeline | |
for FDR in 0.05; do | |
snakemake all --use-conda --cores all --directory /mnt/stripe/shpynov/2022_Atac-Seq-2/ --config fastq_ext=fastq.gz fastq_dir=/mnt/stripe/shpynov/2022_Atac-Seq-2/fastq genome=mm10 bowtie2_params="-X 2000 --dovetail" macs2_mode=narrow macs2_params="-q $FDR -f BAMPE --nomodel --nolambda -B --call-summits" macs2_suffix="q$FDR"; | |
done | |
``` | |
### Reads with additional adapters trimming Trimmomatic | |
``` | |
# Check trimming with trimmomatic | |
cd 2022_Atac-Seq-2/fastq | |
for F in *_R1_*; do | |
echo $F; N=${F/_R1_.fastq.gz/}; PF=${N}_R2_.fastq.gz; echo $PF; | |
trimmomatic PE -threads 8 -trimlog $N.txt -phred33 $F $PF ../trimmed/$F ../trimmed/${N}_unpaired_R1_.fastq.gz ../trimmed/$PF ../trimmed/${N}_unpaired_R2_.fastq.gz ILLUMINACLIP:/home/user/miniconda3/envs/bio/share/trimmomatic/adapters/NexteraPE-PE.fa:2:30:10:8:true; | |
done | |
``` | |
### And reprocess all data | |
``` | |
for FDR in 0.05; do snakemake all --use-conda --cores all --directory /mnt/stripe/shpynov/2022_Atac-Seq-2/ --config fastq_ext=fastq.gz fastq_dir=/mnt/stripe/shpynov/2022_Atac-Seq-2/trimmed genome=mm10 bowtie2_params="-X 2000 --dovetail" macs2_mode=narrow macs2_params="-q $FDR -f BAMPE --nomodel --nolambda -B --call-summits" macs2_suffix="q$FDR" -n; done | |
``` | |
### Check non-aligned reads vs hg38 | |
``` | |
cd /mnt/stripe/shpynov/2022_Atac-Seq-2/bams | |
for F in *.bam; do | |
echo $F; N=${F/.bam/}; | |
echo "Filtering" | |
# Extract unmapped read whose mate is mapped | |
samtools view -u -f 4 -F 264 $F > ../unmapped/${N}_unmapped_1.bam; | |
# Extract mapped read whose mate is unmapped | |
samtools view -u -f 8 -F 260 $F > ../unmapped/${N}_unmapped_2.bam; | |
# Extract unmapped read with unmapped mate | |
samtools view -u -f 12 -F 256 $F > ../unmapped/${N}_unmapped_3.bam; | |
echo "Merging" | |
samtools merge ../unmapped/${N}_unmapped.bam ../unmapped/${N}_unmapped_*.bam; | |
samtools sort -n -o ../unmapped/${N}_unmapped_sorted.bam ../unmapped/${N}_unmapped.bam; | |
echo "Bam to fastq" | |
bedtools bamtofastq -i ../unmapped/${N}_unmapped_sorted.bam -fq ../unmapped/${N}_unmapped.fastq; | |
rm ../unmapped/${N}_unmapped_*.bam; | |
done | |
``` | |
### Subsample all libraries to 20mln | |
``` | |
# Move previous to _full suffix | |
READS=20000000 | |
for F in *.bam; do | |
echo $F | |
FRACTION=$(samtools idxstats $F | cut -f3 | awk -v ct=$READS 'BEGIN {total=0} {total += $1} END {print ct/total}') | |
echo $FRACTION | |
samtools view -@ 4 -b -s ${FRACTION} $F > ${F/.bam/_20mln.bam} | |
done | |
# Visualization | |
for F in *.bam; do echo $F; bamCoverage -b $F -p 6 -o ${F/.bam/.bw}; done | |
# MACS2 peak calling | |
for F in *.bam; do echo $F; macs2 callpeak -t $F -g mm -q 0.05 -f BAMPE --nomodel --nolambda -B --call-summits -n ${F/.bam/_q0.05} &> | tee ${F/.bam/_macs2.log}; | |
done | |
# SPAN peak calling | |
for F in *.bam; do echo $F; java -jar ~/span-1.0.build.jar analyze -t $F -cs mm10.chrom.sizes --fdr 0.05 -peaks ${F/.bam/_q0.05.peak} &> ${F/.bam/_span.log}; | |
done | |
``` | |
### Compute consensus peaks for all the groups | |
``` | |
# IMPORTANT: Call bedtools merge after multiiter! | |
cd 2022_Atac-Seq-2/macs2 | |
T=$(mktemp); | |
for F in Alone 15min 30min 45min; do echo $F; bedtools multiinter -i $(ls *$F*.narrowPeak) | grep -E '.*(.*,){2,}.*' | awk -v OFS='\t' '{print $1,$2,$3}' > $T; bedtools merge -i $T > ${F}_macs2_cons3.bed3; | |
done | |
cd 2022_Atac-Seq-2/span | |
for F in Alone 15min 30min 45min; do echo $F; bedtools multiinter -i $(ls *$F*.peak) | grep -E '.*(.*,){2,}.*' | awk -v OFS='\t' '{print $1,$2,$3}' > $T; bedtools merge -i $T > ${F}_span_cons3.bed3; | |
done | |
``` | |
### R downstream analysis of consensus peaks | |
``` | |
library(ChIPpeakAnno) | |
m15 = toGRanges("/home/oshpynov/data/2022_Atac-Seq-2/macs2_20mln/15min_macs2_cons3.bed3", format="BED", header=FALSE) | |
m30 = toGRanges("/home/oshpynov/data/2022_Atac-Seq-2/macs2_20mln/30min_macs2_cons3.bed3", format="BED", header=FALSE) | |
m45 = toGRanges("/home/oshpynov/data/2022_Atac-Seq-2/macs2_20mln//45min_macs2_cons3.bed3", format="BED", header=FALSE) | |
ma = toGRanges("/home/oshpynov/data/2022_Atac-Seq-2/macs2_20mln//Alone_macs2_cons3.bed3", format="BED", header=FALSE) | |
ol <- findOverlapsOfPeaks(m15, m30, m45, ma) | |
png('/home/oshpynov/data/2022_Atac-Seq-2/venn.png') | |
makeVennDiagram(ol) | |
dev.off() | |
library(TxDb.Mmusculus.UCSC.mm10.knownGene) | |
peaks <- GRangesList(m15=m15, m30=m30, m45=m45, ma=ma) | |
png('/home/oshpynov/data/2022_Atac-Seq-2/distribution.png', width=6, height=3, units="in", res=1200, pointsize = 1) | |
genomicElementDistribution(peaks, TxDb = TxDb.Mmusculus.UCSC.mm10.knownGene, promoterRegion=c(upstream=2000, downstream=500), geneDownstream=c(upstream=0, downstream=2000)) | |
dev.off() | |
overlaps <- ol$peaklist[['m15///m30///m45///ma']] | |
library(EnsDb.Mmusculus.v79) | |
annoData <- toGRanges(EnsDb.Mmusculus.v79, feature="gene") | |
png('/home/oshpynov/data/2022_Atac-Seq-2/anno.png', width=5, height=3, units="in", res=1200, pointsize = 1) | |
binOverFeature(overlaps, annotationData=annoData,radius=5000, nbins=20, FUN=length, errFun=0, xlab="distance from TSS (bp)", ylab="count", main="Distribution of aggregated peak numbers around TSS") | |
dev.off() | |
overlaps.anno <- annotatePeakInBatch(overlaps, AnnotationData=annoData, output="nearestBiDirectionalPromoters", bindingRegion=c(-2000, 500)) | |
library(org.Mm.eg.db) | |
overlaps.anno <- addGeneIDs(overlaps.anno, "org.Mm.eg.db", IDs2Add = "entrez_id") | |
write.csv(as.data.frame(unname(overlaps.anno)), "anno.csv") | |
over <- getEnrichedGO(overlaps.anno, orgAnn="org.Mm.eg.db", condense=TRUE) | |
png('/home/oshpynov/data/2022_Atac-Seq-2/go.png', width=10, height=4, units="in", res=1200, pointsize = 1) | |
enrichmentPlot(over) | |
dev.off() | |
``` | |
### Diffbind analysis | |
``` | |
# Create DiffBind config | |
cd /mnt/stripe/shpynov/2022_Atac-Seq-2/diffbind | |
echo "SampleID,Tissue,Factor,Condition,Replicate,bamReads,ControlID,bamControl,Peaks,PeakCaller" > config.csv | |
for F in Mac-15min Mac-30min_AC Mac-45min_AC MacAlone; do for R in 1 2 3 4; do SAMPLE="$F-$R"; BAMREADS=$(find ~/data/2022_Atac-Seq-2/bams_20mln -name "$F*-${R}_20mln.bam"); PEAKS=$(find ~/data/2022_Atac-Seq-2/macs2_20mln -name "$F*-$R*.narrowPeak"); echo "$SAMPLE,,Type,$F,$R,$BAMREADS,,,$PEAKS,bed" >> config.csv; done; done | |
# R code | |
library(DiffBind) | |
setwd("~/data/2022_Atac-Seq-2/diff_alone_vs_45/") | |
samples = dba(sampleSheet = 'config.csv') | |
sample_counts = dba.count(samples) | |
png('counts.png') | |
plot(sample_counts) | |
dev.off() | |
sample_contrast = dba.contrast(sample_counts) | |
sample_contrast = dba.analyze(sample_contrast) | |
png('overlap.png') | |
overlap_rate = dba.overlap(sample_contrast, mode = DBA_OLAP_RATE) | |
plot(overlap_rate, type = "b", ylab = "# peaks", xlab = "Overlap at least this many peaksets") | |
dev.off() | |
png('pca.png') | |
dba.plotPCA(sample_contrast) | |
dev.off() | |
png('ma.png') | |
dba.plotMA(sample_contrast) | |
dev.off() | |
png('volcano.png') | |
dba.plotVolcano(sample_contrast) | |
dev.off() | |
sample_diff = dba.report(sample_contrast) | |
write.table(sample_diff, 'diff_alone_vs_45.csv', sep = ",", row.names = FALSE) | |
# BASH csv to bed3 | |
cat diff_alone_vs_45.csv | grep -v 'start' | sed 's/"//g' | awk -v FS=',' -v OFS='\t' '{print $1,$2,$3}' | sort -k1,1 -k2,2n > diff_alone_vs_45.bed3 | |
``` | |
### Analysis of differential peaks | |
``` | |
# Top 1000 most significant differential peaks | |
cat alone_vs_45.csv | grep -v 'start' | head -n 1000 | sed 's/"//g' | awk -v FS=',' -v OFS='\t' '{print $1,$2,$3}' | sort -k1,1 -k2,2n > alone_vs_45_top1000.bed3 | |
# Motif enrichment analysis | |
perl ~/miniconda3/envs/bio/share/homer/bin/findMotifsGenome.pl alone_vs_45_top1000.bed3 mm10 alone_vs_45_top1000_motifs -size 1000 -mask; | |
# Functional annotation analysis with ChIPSeeker | |
txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene | |
peakAnno <- annotatePeak(peaks, tssRegion=c(-3000, -3000), TxDb=txdb, annoDb="org.Mm.eg.db") | |
png('/home/oshpynov/data/2022_Atac-Seq-2/diff/peakanno.png', width=5, height=2, units="in", res=1200, pointsize = 1) | |
plotAnnoBar(peakAnno) | |
dev.off() | |
png('/home/oshpynov/data/2022_Atac-Seq-2/diff/tss.png', width=10, height=4, units="in", res=1200, pointsize = 1) | |
plotDistToTSS(peakAnno, title="Distribution relative to TSS") | |
dev.off() | |
# Functional annotation analysis with ChIPPeakAnno | |
library(ChIPpeakAnno) | |
peaks = toGRanges("/home/oshpynov/data/2022_Atac-Seq-2/diff/alone_vs_45.bed3", format="BED", header=FALSE) | |
library(TxDb.Mmusculus.UCSC.mm10.knownGene) | |
png('/home/oshpynov/data/2022_Atac-Seq-2/diff/distribution.png', width=6, height=3, units="in", res=1200, pointsize = 1) | |
genomicElementDistribution(peaks, TxDb = TxDb.Mmusculus.UCSC.mm10.knownGene, promoterRegion=c(upstream=2000, downstream=500), geneDownstream=c(upstream=0, downstream=2000)) | |
dev.off() | |
library(EnsDb.Mmusculus.v79) | |
annoData <- toGRanges(EnsDb.Mmusculus.v79, feature="gene") | |
png('/home/oshpynov/data/2022_Atac-Seq-2/diff/anno.png') | |
binOverFeature(peaks, annotationData=annoData,radius=5000, nbins=20, FUN=length, errFun=0, xlab="distance from TSS (bp)", ylab="count", main="Distribution of peaks around TSS") | |
dev.off() | |
library(org.Mm.eg.db) | |
peaks.anno <- annotatePeakInBatch(peaks, AnnotationData=annoData, output="nearestBiDirectionalPromoters", bindingRegion=c(-2000, 500)) | |
peaks.anno <- addGeneIDs(peaks.anno, "org.Mm.eg.db", IDs2Add = "entrez_id") | |
write.csv(as.data.frame(unname(peaks.anno)), "anno.csv") | |
over <- getEnrichedGO(peaks.anno, orgAnn="org.Mm.eg.db", condense=TRUE) | |
png('/home/oshpynov/data/2022_Atac-Seq-2/diff/go.png', width=10, height=4, units="in", res=1200, pointsize = 1) | |
enrichmentPlot(over) | |
dev.off() | |
``` | |
### Motif enrichment with background | |
``` | |
# Motif enrichment analysis with background | |
perl ~/miniconda3/envs/bio/share/homer/bin/findMotifsGenome.pl alone_vs_45_top1000.bed3 mm10 alone_vs_45_top1000_motifs_vs_background -size 1000 -mask -bg alone_vs_45_background.bed3; | |
``` | |
### Prepare background | |
``` | |
# Prepare all peaks | |
T=$(mktemp) | |
bedtools multiinter -i *.narrowPeak | awk -v OFS='\t' '{print $1,$2,$3}' > $T; | |
bedtools merge -i $T > all_peaks.bed3 | |
# Prepare background as (all_peaks - peaks) + peaks | |
bedtools subtract -A -a ../macs2_20mln/all_peaks.bed3 -b alone_vs_45.bed3 > $T; | |
bedtools multiinter -i $T alone_vs_45.bed3 | sort -k1,1 -k2,2n > $T.2; | |
bedtools merge -i $T.2 > alone_vs_45_background.bed3; | |
``` | |
### Diffbind pairwise analysis | |
``` | |
# R code | |
library(DiffBind) | |
# Use one of these ... | |
setwd("~/data/2022_Atac-Seq-2/diff_alone_vs_15/") | |
setwd("~/data/2022_Atac-Seq-2/diff_alone_vs_30/") | |
setwd("~/data/2022_Atac-Seq-2/diff_15_vs_45/") | |
samples = dba(sampleSheet = 'config.csv') | |
sample_counts = dba.count(samples) | |
peaksets <- dba.peakset(sample_counts, bRetrieve = TRUE) | |
png('counts.png') | |
plot(sample_counts) | |
dev.off() | |
sample_contrast = dba.contrast(sample_counts) | |
sample_contrast = dba.analyze(sample_contrast) | |
png('overlap.png') | |
overlap_rate = dba.overlap(sample_contrast, mode = DBA_OLAP_RATE) | |
plot(overlap_rate, type = "b", ylab = "# peaks", xlab = "Overlap at least this many peaksets") | |
dev.off() | |
png('pca.png') | |
dba.plotPCA(sample_contrast) | |
dev.off() | |
png('ma.png') | |
dba.plotMA(sample_contrast) | |
dev.off() | |
png('volcano.png') | |
dba.plotVolcano(sample_contrast) | |
dev.off() | |
sample_diff = dba.report(sample_contrast) | |
write.table(sample_diff, 'diff_alone_vs_15.csv', sep = ",", row.names = FALSE) | |
# BASH csv to bed3 | |
cat diff_alone_vs_15.csv | grep -v 'start' | sed 's/"//g' | awk -v FS=',' -v OFS='\t' '{print $1,$2,$3}' | sort -k1,1 -k2,2n > diff_alone_vs_15.bed3 | |
# BASH csv to bed3 | |
cat diff_alone_vs_30.csv | grep -v 'start' | sed 's/"//g' | awk -v FS=',' -v OFS='\t' '{print $1,$2,$3}' | sort -k1,1 -k2,2n > diff_alone_vs_30.bed3 | |
# BASH csv to bed3 | |
cat diff_15_vs_45.csv | grep -v 'start' | sed 's/"//g' | awk -v FS=',' -v OFS='\t' '{print $1,$2,$3}' | sort -k1,1 -k2,2n > diff_15_vs_45.bed3 | |
``` | |
### Compute all differential peaks union | |
``` | |
# Compute all differential peaks union | |
T=$(mktemp) | |
bedtools multiinter -i $(find . -name "diff*.bed3") | awk -v OFS='\t' '{print $1,$2,$3}' > $T | |
bedtools merge -i $T > diff_pairwise.bed3 | |
# Create DiffBind config | |
cd /data/2022_Atac-Seq-2/diffbind | |
echo "SampleID,Tissue,Factor,Condition,Replicate,bamReads,ControlID,bamControl,Peaks,PeakCaller" > config.csv | |
for F in Mac-15min Mac-30min_AC Mac-45min_AC MacAlone; do for R in 1 2 3 4; do SAMPLE="$F-$R"; BAMREADS=$(find /data/2022_Atac-Seq-2/bams_20mln -name "$F*-${R}_20mln.bam"); echo "$SAMPLE,,Type,$F,$R,$BAMREADS,,,/data/2022_Atac-Seq-2/diff_pairwise.bed3,bed" >> config.csv; done; done | |
# R code for counts and genes annotating | |
library(DiffBind) | |
samples = dba(sampleSheet = 'config.csv') | |
sample_counts = dba.count(samples) | |
peaksets <- dba.peakset(sample_counts, bRetrieve = TRUE) | |
library(EnsDb.Mmusculus.v79) | |
annoData <- toGRanges(EnsDb.Mmusculus.v79, feature="gene") | |
library(org.Mm.eg.db) | |
peaksets <- annotatePeakInBatch(peaksets, AnnotationData=annoData) # Full annotation | |
peaksets <- addGeneIDs(peaksets, "org.Mm.eg.db", IDs2Add = "entrez_id") | |
write.csv(peaksets, 'diff_pairwise_counts.csv', row.names = FALSE) | |
# BASH | |
cat diff_pairwise_counts.csv | sed 's/,/\t/g' > diff_pairwise_counts.tsv | |
``` | |
# Integrate with DE RNA-seq data | |
``` | |
# Inspect number of DE genes with respect to change sign | |
cd de_filtered_tables | |
for F in *.csv; do | |
echo $F; | |
cat $F | grep -v log2 | wc -l; | |
cat $F | grep -v log2 | awk -v FS=',' '($3<0) {print $1}' | wc -l; | |
cat $F | grep -v log2 | awk -v FS=',' '($3>0) {print $1}' | wc -l; | |
done | |
``` | |
# Prepare peaks for annotation with related genes | |
``` | |
for F in diff_alone_vs_45/alone_vs_45.csv diff_alone_vs_15/alone_vs_15.csv diff_alone_vs_30/alone_vs_30.csv \ | |
diff_15_vs_30/diff_15_vs_30.csv diff_15_vs_45/diff_15_vs_45.csv diff_30_vs_45/diff_30_vs_45.csv; do | |
echo $F | |
wc -l $F | |
cat $F | grep -v seqname | awk -v FS=',' -v OFS='\t' '($9<0) {print $1,$2,$3}' | sed 's#"##g' |\ | |
grep -v -E 'chr[^\t]*_' | sort -k1,1 -k2,2n > ${F/.csv/_appear.bed3} | |
wc -l ${F/.csv/_appear.bed3} | |
cat $F | grep -v seqname | awk -v FS=',' -v OFS='\t' '($9>0) {print $1,$2,$3}' | sed 's#"##g' |\ | |
grep -v -E 'chr[^\t]*_' | sort -k1,1 -k2,2n > ${F/.csv/_disappear.bed3} | |
wc -l ${F/.csv/_disappear.bed3} | |
done | |
``` | |
Annotate DE ATAC-seq peaks with related genes | |
```R | |
library(ChIPpeakAnno) | |
library(org.Mm.eg.db) | |
library(EnsDb.Mmusculus.v79) | |
annoData <- toGRanges(EnsDb.Mmusculus.v79, feature="gene") | |
FILES = c( | |
"/data/2022_Atac-Seq-2/macs2_20mln/all_peaks.bed3", | |
"/data/2022_Atac-Seq-2/diff_alone_vs_45/alone_vs_45_appear.bed3", | |
"/data/2022_Atac-Seq-2/diff_alone_vs_45/alone_vs_45_disappear.bed3", | |
"/data/2022_Atac-Seq-2/diff_alone_vs_15/alone_vs_15_appear.bed3", | |
"/data/2022_Atac-Seq-2/diff_alone_vs_15/alone_vs_15_disappear.bed3", | |
"/data/2022_Atac-Seq-2/diff_alone_vs_30/alone_vs_30_appear.bed3", | |
"/data/2022_Atac-Seq-2/diff_alone_vs_30/alone_vs_30_disappear.bed3", | |
"/data/2022_Atac-Seq-2/diff_15_vs_30/diff_15_vs_30_appear.bed3", | |
"/data/2022_Atac-Seq-2/diff_15_vs_30/diff_15_vs_30_disappear.bed3", | |
"/data/2022_Atac-Seq-2/diff_15_vs_45/diff_15_vs_45_appear.bed3", | |
"/data/2022_Atac-Seq-2/diff_15_vs_45/diff_15_vs_45_disappear.bed3", | |
"/data/2022_Atac-Seq-2/diff_30_vs_45/diff_30_vs_45_appear.bed3", | |
"/data/2022_Atac-Seq-2/diff_30_vs_45/diff_30_vs_45_disappear.bed3" | |
) | |
for (file in FILES) { | |
print(file) | |
peaks = toGRanges(file, format="BED", header=FALSE) | |
peaks.anno <- annotatePeakInBatch(peaks, AnnotationData=annoData, output="nearestBiDirectionalPromoters", bindingRegion=c(-2000, 500)) | |
peaks.anno <- addGeneIDs(peaks.anno, "org.Mm.eg.db", IDs2Add = "entrez_id") | |
write.csv(as.data.frame(unname(peaks.anno)), paste(file, ".csv", sep="")) | |
} | |
``` | |
Overlap genes from ATAC-seq DE and RNA-seq DE.\ | |
Check different timepoints DE results vs peak in ATAC-seq. | |
```bash | |
mkdir -p rna-atac | |
# Filter RNA-seq DE genes versus ATAC-seq files | |
for DE in de_filtered_tables/45_vs_alone_filter_table.csv de_filtered_tables/90_vs_alone_filter_table.csv de_filtered_tables/180_vs_alone_filter_table.csv; do | |
for P in all_peaks.bed3.csv \ | |
diff_alone_vs_45/alone_vs_45_appear.bed3.csv diff_alone_vs_45/alone_vs_45_disappear.bed3.csv \ | |
diff_alone_vs_30/alone_vs_30_appear.bed3.csv diff_alone_vs_30/alone_vs_30_disappear.bed3.csv \ | |
diff_alone_vs_15/alone_vs_15_appear.bed3.csv diff_alone_vs_15/alone_vs_15_disappear.bed3.csv \ | |
diff_15_vs_30/diff_15_vs_30_appear.bed3.csv diff_15_vs_30/diff_15_vs_30_disappear.bed3.csv \ | |
diff_15_vs_45/diff_15_vs_45_appear.bed3.csv diff_15_vs_45/diff_15_vs_45_disappear.bed3.csv \ | |
diff_30_vs_45/diff_30_vs_45_appear.bed3.csv diff_30_vs_45/diff_30_vs_45_disappear.bed3.csv; do | |
echo "Differential expression $DE $(cat $DE | wc -l) vs peaks $P $(cat $P | wc -l)"; | |
DEF=$(basename $DE | sed -E 's/\..*//'); | |
PF=$(basename $P | sed -E 's/\..*//'); | |
echo "Diff $DE < 0: $(cat $DE | grep -v log2 | awk -v FS=',' '($3<0) {print $8}' | wc -l) vs $P"; | |
touch rna-atac/rna_${DEF}_gt0_vs_atac_${PF}.csv; | |
cat $DE | grep -v log2 | awk -v FS=',' '($3<0) {print $8}' | sed -E 's/\..*//' | while read -r ENS; do | |
grep $ENS $P; done > rna-atac/rna_${DEF}_gt0_vs_atac_${PF}.csv; | |
echo "Diff $DE > 0: $(cat $DE | grep -v log2 | awk -v FS=',' '($3>0) {print $8}' | wc -l) vs $P"; | |
touch rna-atac/rna_${DEF}_ls0_vs_atac_${PF}.csv; | |
cat $DE | grep -v log2 | awk -v FS=',' '($3>0) {print $8}' | sed -E 's/\..*//' | while read -r ENS; do | |
grep $ENS $P; done > rna-atac/rna_${DEF}_ls0_vs_atac_${PF}.csv; | |
done; | |
done; | |
# Produce BED, TXT with gene name and Ensemble gene names | |
cd rna-atac | |
for F in *.csv; do | |
echo $F; | |
cat $F | awk -v FS=',' -v OFS='\t' '{print $2,$3,$4,$16,$12,$14,$15}' |\ | |
sed 's/"//g' | sort -k1,1 -k2,2n > ${F/.csv/.bed}; | |
cat $F | awk -v FS=',' '{print $16}' | sed 's/"//g' | sort --unique > ${F/.csv/_gene.txt}; | |
cat $F | awk -v FS=',' '{print $8}' | sed 's/"//g' | sort --unique > ${F/.csv/_ens.txt}; | |
wc -l ${F/.csv/_ens.txt}; | |
done | |
``` | |
Save positions of all ATAC-seq peaks associated with genes | |
``` | |
cat all_peaks.bed3.csv | grep -v seqnames | awk -v FS=',' -v OFS='\t' '{print $2,$3,$4,$8,$12,$14,$15}' |\ | |
sed 's/"//g' | sort -k1,1 -k2,2n > all_peaks.bed | |
cat all_peaks.bed3.csv | grep -v seqnames | awk -v FS=',' -v OFS='\t' '{print $8}' | sed 's/"//g' |\ | |
sort --unique > all_peaks.txt | |
``` | |
Analyze clusters of ATAC-seq data - launch `analysis.ipynb` jupyter notebook | |
``` | |
cd ~/data/2022_Atac-Seq-2 | |
perl ~/miniconda3/envs/bio/share/homer/bin/findMotifsGenome.pl diff_pairwise.bed3 mm10 diff_pairwise -mask | |
cd clusters | |
for F in *.bed3; do | |
perl ~/miniconda3/envs/bio/share/homer/bin/findMotifsGenome.pl $F mm10 ${F/.bed3/} -mask | |
done | |
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment