Skip to content

Instantly share code, notes, and snippets.

@conchoecia
Last active October 10, 2018 00:27
Show Gist options
  • Save conchoecia/cb27dc2ca94dcd8ccb89baa2fc4c01dc to your computer and use it in GitHub Desktop.
Save conchoecia/cb27dc2ca94dcd8ccb89baa2fc4c01dc to your computer and use it in GitHub Desktop.
get info on linker content in HiC libaries
#!/bin/bash
# this script makes plots of positions of linker sequences in HiC/Chicago libraries
#!/bin/bash
function processthis {
OUT1="${LIB}fpos.txt"
IN1="${LIB}_f.fastq.gz"
OUT2="${LIB}rpos.txt"
IN2="${LIB}_r.fastq.gz"
OUT3="${LIB}fpos_4.txt"
OUT4="${LIB}rpos_4.txt"
OUT5="${LIB}fpos_4_noAATTAATT.txt"
OUT6="${LIB}rpos_4_noAATTAATT.txt"
OUT7="${LIB}fpos_noAATT_start.txt"
OUT8="${LIB}rpos_noAATT_start.txt"
TABLE="${LIB}_table.txt"
PLOT="${LIB}_linkerpos.pdf"
bioawk -v cutsite="${SITE}" -cfastx 'BEGIN{for(i=1; i<=75; i++){pos[i]=0}} {for (i=1; i<=75-8+1; i++) {thisseq=substr($seq, i, 8); if (thisseq==cutsite){pos[i] += 1} } } END{for (i=1; i<=75-8+1; i++) { printf("%d\t%d\n", i, pos[i])} }' "${IN1}" > "${OUT1}"
bioawk -v cutsite="${SITE}" -cfastx 'BEGIN{foursite=substr(cutsite,1,4); for(i=1; i<=75; i++){pos[i]=0}} {for (i=1; i<=75-8+1; i++) {thisseq=substr($seq, i, 4); if (thisseq==foursite){pos[i] += 1} } } END{for (i=1; i<=75-8+1; i++) { printf("%d\t%d\n", i, pos[i])} }' "${IN1}" > "${OUT3}"
bioawk -v cutsite="${SITE}" -cfastx 'BEGIN{foursite=substr(cutsite,1,4); for(i=1; i<=75; i++){pos[i]=0}} {cont=1; for (i=1; i<=75-8+1; i++) {if (substr($seq, i, 8)==cutsite){ cont=0}}; if (cont==1){for (i=1; i<=75-8+1; i++) {thisseq=substr($seq, i, 4); if (thisseq==foursite){pos[i] += 1} } }} END{for (i=1; i<=75-8+1; i++) { printf("%d\t%d\n", i, pos[i])} }' "${IN1}" > "${OUT5}"
bioawk -v cutsite="${SITE}" -cfastx 'BEGIN{foursite=substr(cutsite,1,4); for(i=1; i<=75; i++){pos[i]=0}} {cont=1; if (substr($seq, 1, 4)==foursite){ cont=0}; if (cont==1){for (i=1; i<=75-8+1; i++) {if (substr($seq, i, 8)==cutsite){pos[i] += 1} } } } END{for (i=1; i<=75-8+1; i++) { printf("%d\t%d\n", i, pos[i])} }' "${IN1}" > "${OUT7}"
bioawk -v cutsite="${SITE}" -cfastx 'BEGIN{for(i=1; i<=75; i++){pos[i]=0}} {for (i=1; i<=75-8+1; i++) {thisseq=substr($seq, i, 8); if (thisseq==cutsite){pos[i] += 1} } } END{for (i=1; i<=75-8+1; i++) { printf("%d\t%d\n", i, pos[i])} }' "${IN2}" > "${OUT2}"
bioawk -v cutsite="${SITE}" -cfastx 'BEGIN{foursite=substr(cutsite,1,4); for(i=1; i<=75; i++){pos[i]=0}} {for (i=1; i<=75-8+1; i++) {thisseq=substr($seq, i, 4); if (thisseq==foursite){pos[i] += 1} } } END{for (i=1; i<=75-8+1; i++) { printf("%d\t%d\n", i, pos[i])} }' "${IN2}" > "${OUT4}"
bioawk -v cutsite="${SITE}" -cfastx 'BEGIN{foursite=substr(cutsite,1,4); for(i=1; i<=75; i++){pos[i]=0}} {cont=1; for (i=1; i<=75-8+1; i++) {if (substr($seq, i, 8)==cutsite){ cont=0}}; if (cont==1){for (i=1; i<=75-8+1; i++) {thisseq=substr($seq, i, 4); if (thisseq==foursite){pos[i] += 1} } }} END{for (i=1; i<=75-8+1; i++) { printf("%d\t%d\n", i, pos[i])} }' "${IN2}" > "${OUT6}"
bioawk -v cutsite="${SITE}" -cfastx 'BEGIN{foursite=substr(cutsite,1,4); for(i=1; i<=75; i++){pos[i]=0}} {cont=1; if (substr($seq, 1, 4)==foursite){ cont=0}; if (cont==1){for (i=1; i<=75-8+1; i++) {if (substr($seq, i, 8)==cutsite){pos[i] += 1} } } } END{for (i=1; i<=75-8+1; i++) { printf("%d\t%d\n", i, pos[i])} }' "${IN2}" > "${OUT8}"
paste "${OUT1}" "${OUT2}" "${OUT3}" "${OUT4}" "${OUT5}" "${OUT6}" "${OUT7}" "${OUT8}"| awk '{printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $4, $6, $8, $10, $12, $14, $16)}' > "${TABLE}"
Rscript plot_table.R "${TABLE}" "${PLOT}" "${TITLE}"
}
SITE="AATTAATT"
LIB="DS249"
TITLE="DS249 AATTAATT HiC - Freeze-dried sample - NEB FS kit - ~50,000 reads"
processthis
args <- commandArgs(trailingOnly = TRUE)
print(args)
# trailingOnly=TRUE means that only your arguments are returned, check:
# print(commandArgs(trailingOnly=FALSE))
graph = args[1]
output = args[2]
title = args[3]
rm(args)
pdf(file=output)
df = read.table(graph,
sep="\t", header = FALSE, row.names = NULL)
plot(c(df$V1, df$V1), c(df$V2, df$V3),
pch=c(rep.int(1, length(df$V1)),rep.int(3, length(df$V1))),
main = title,
cex.main=0.5,
xlab = "position in read",
ylab = "number of occurrences of linker")
lines(df$V1, df$V2, lwd=2, col=rgb(1, 0, 0, 0.5))
lines(df$V1, df$V3, lwd=2, col=rgb(0, 0, 1, 0.5))
legend(55, max(max(df$V2), max(df$V3)) ,
legend=c("R1", "R2"),
col=c(rgb(1, 0, 0, 0.5), rgb(0, 0, 1, 0.5)), lty=c(1,1),
lwd = c(2, 2), cex=1.2, pch = c(1, 3) )
plot(c(df$V1, df$V1), c(df$V4, df$V5),
pch=c(rep.int(1, length(df$V1)),rep.int(3, length(df$V1))),
main = title,
cex.main=0.5,
xlab = "position in read",
ylab = "# linker[0:4]")
lines(df$V1, df$V4, lwd=2, col=rgb(1, 0, 0, 0.5))
lines(df$V1, df$V5, lwd=2, col=rgb(0, 0, 1, 0.5))
legend(55, max(max(df$V4), max(df$V5)) ,
legend=c("R1", "R2"),
col=c(rgb(1, 0, 0, 0.5), rgb(0, 0, 1, 0.5)), lty=c(1,1),
lwd = c(2, 2), cex=1.2, pch = c(1, 3) )
par(mfrow=c(2,1),
oma = c(3, 3, 0, 0)+0.2, # two rows of text at the outer left and bottom margin
mar = c(1, 1, 1, 1)+0.2, # space for one row of text at ticks and to separate plots
mgp = c(2, 1, 0), # axis label at 2 rows distance, tick labels at 1 row
xpd = NA)
plot(c(df$V1, df$V1), c(df$V2, df$V3),
pch=c(rep.int(1, length(df$V1)),rep.int(3, length(df$V1))),
main = title,
cex.main=0.5,
xlab = "",
ylab = "number of occurrences of linker")
lines(df$V1, df$V2, lwd=2, col=rgb(1, 0, 0, 0.5))
lines(df$V1, df$V3, lwd=2, col=rgb(0, 0, 1, 0.5))
legend(55, max(max(df$V2), max(df$V3)) ,
legend=c("R1", "R2"),
col=c(rgb(1, 0, 0, 0.5), rgb(0, 0, 1, 0.5)), lty=c(1,1),
lwd = c(2, 2), cex=1.2, pch = c(1, 3) )
plot(c(df$V1, df$V1), c(df$V4, df$V5),
pch=c(rep.int(1, length(df$V1)),rep.int(3, length(df$V1))),
xlab = "position in read",
ylab = "# linker[0:4]")
lines(df$V1, df$V4, lwd=2, col=rgb(1, 0, 0, 0.5))
lines(df$V1, df$V5, lwd=2, col=rgb(0, 0, 1, 0.5))
legend(55, max(max(df$V4), max(df$V5)) ,
legend=c("R1", "R2"),
col=c(rgb(1, 0, 0, 0.5), rgb(0, 0, 1, 0.5)), lty=c(1,1),
lwd = c(2, 2), cex=1.2, pch = c(1, 3) )
par(mfrow=c(2,1),
oma = c(3, 3, 0, 0)+0.2, # two rows of text at the outer left and bottom margin
mar = c(1, 1, 1, 1)+0.2, # space for one row of text at ticks and to separate plots
mgp = c(2, 1, 0), # axis label at 2 rows distance, tick labels at 1 row
xpd = NA)
plot(c(df$V1, df$V1), c(df$V6, df$V7),
pch=c(rep.int(1, length(df$V6)),rep.int(3, length(df$V7))),
main = title,
xlab="",
cex.main=0.5,
ylab = "# linker[0:4] if no linker[0:8] in read")
lines(df$V1, df$V6, lwd=2, col=rgb(1, 0, 0, 0.5))
lines(df$V1, df$V7, lwd=2, col=rgb(0, 0, 1, 0.5))
legend(55, max(max(df$V6), max(df$V7)) ,
legend=c("R1", "R2"),
col=c(rgb(1, 0, 0, 0.5), rgb(0, 0, 1, 0.5)), lty=c(1,1),
lwd = c(2, 2), cex=1.2, pch = c(1, 3) )
plot(c(df$V1, df$V1), c(df$V8, df$V9),
pch=c(rep.int(1, length(df$V8)),rep.int(3, length(df$V9))),
xlab = "position in read",
ylab = "# linker[0:8] if no linker[0:4] at pos0")
lines(df$V1, df$V8, lwd=2, col=rgb(1, 0, 0, 0.5))
lines(df$V1, df$V9, lwd=2, col=rgb(0, 0, 1, 0.5))
legend(55, max(max(df$V8), max(df$V9)) ,
legend=c("R1", "R2"),
col=c(rgb(1, 0, 0, 0.5), rgb(0, 0, 1, 0.5)), lty=c(1,1),
lwd = c(2, 2), cex=1.2, pch = c(1, 3) )
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment