Created
July 14, 2017 07:23
-
-
Save kloetzl/1dd24925c031d85fc9c3222c4aed8b00 to your computer and use it in GitHub Desktop.
reaper source
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* (C) Copyright 2011, 2012, 2013, 2014, 2015 European Molecular Biology Laboratory. | |
* Author: Stijn van Dongen <stijn@ebi.ac.uk>. | |
* Contact: <kraken@ebi.ac.uk> | |
* | |
* This file is part of Reaper. Reaper is free software: you can redistribute | |
* it and/or modify it under the terms of the GNU General Public License as | |
* published by the Free Software Foundation; either version 3 of the License, | |
* or (at your option) any later version. This program is distributed in the | |
* hope that it will be useful, but WITHOUT ANY WARRANTY; without even the | |
* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | |
* See the GNU General Public License for more details. You should have | |
* received a copy of the GNU General Public License along with this program. | |
* If not, see http://www.gnu.org/licenses/. | |
*/ | |
/* | |
This is reaper, a flexible and fast program for pre-processing short-read sequencing data. | |
- Match read with 3p-adaptor | |
- uses Smith-Waterman local alignment over full read and adaptor | |
1. The best local alignment is considered and tested against user-set thresholds | |
2. Optionally weaker matches between read-suffix and adaptor-prefix can be considered as well | |
- read/adaptor match is user controlled (alignment identity threshold) | |
- handling of poly-A (optional) | |
- handling of N in sequence (optional) | |
- handling of B quality scores (optional) | |
- Check for ligation contamination | |
- Smith-Waterman local alignment | |
- Handles barcodes | |
- Output split into per-barcode clean files and annotated discard files | |
- Recognises different layouts of barcodes and sequence inserts | |
- Sane defaults and adaptable. | |
- Outputs per-barcode summary statistics for comprehensive QC plots | |
- Output fields can be chosen and formatted | |
- Per-read characteristics available (e.g. alignment, trinucleotide score) | |
- One read-at-time, written in C | |
- Small memory footprint (order of megabytes) | |
- 1-4 million reads per minute (depending on read length, contamination check) | |
- Twinned with 'tally' for fast uniquifying of reads | |
NOTES | |
* Implementation caveat I. | |
param->buck[buck_n] exists, contrary to normal C convention. | |
qc_tally_in and qc_tally_out write in this bucket - it is used for | |
grand-total reporting. See /specialbucket/. | |
This has the advantage that normal bucket processing code looks normal, and | |
will not screw up the special bucket - only memory management looks peculiar. | |
* Implementation caveat II. | |
The code mixes 0-based ofsets and 1-based ofsets. The first derive | |
from normal C-strings, the second from alignment matrices -- this | |
is because they are zero-bordered; the first base in e.g. the adaptor | |
maps to the second column of the alignment matrix, indexed 1. | |
See /stringoffsets/. | |
* Implementation Note. | |
do_adaptor_3p and do_barcode_3p are quite different. The former allows the | |
absence of a match, the latter requires exactly one match. The former allows | |
more laxness when having an end-to-start match, and additionally allows a | |
3p-adaptor prefix match if it is folllowed by low complexity code. | |
*/ | |
#define REAPER_FAIL 29 | |
#define REAPER_OK 0 | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <stdarg.h> | |
#include <signal.h> | |
#include <limits.h> | |
#include <time.h> | |
#include <zlib.h> | |
#include <ctype.h> | |
#include "reaper.h" | |
#include "version.h" | |
#include "sw.h" | |
#include "slib.h" | |
#include "trint.h" | |
#include "table.h" | |
#include "dna.h" | |
#ifndef MAXFIELDSIZE | |
# define MAXFIELDSIZE 1023 | |
#endif | |
int g_debug = 0; | |
int g_guard = 0; | |
int g_restrict = 18; | |
unsigned long g_fieldtosmall = 0; | |
/* NOTE these are coupled to the below array */ | |
enum program_mode | |
{ CASE_NOTSPECIFIED = 0 | |
/* Caveat: we use the test g_mode > CASE_NOBC in the program */ | |
, CASE_NOBC | |
, CASE_3P_BC | |
, CASE_5P_BC | |
} ; | |
/* NOTE these are coupled to the above enum */ | |
const char* G_MODE[6] | |
= { "NA" | |
, "no-bc" | |
, "3p-bc" | |
, "5p-bc" | |
}; | |
enum program_mode g_mode = CASE_NOTSPECIFIED; | |
const char* x_3pgb[2] = { "3p-global -", "3p-global +" }; | |
const char* x_tabu[2] = { "tabu -", "tabu +" }; | |
const char* x_3ppx[2] = { "3p-prefix -", "3p-prefix +" }; | |
const char* x_3pbc[2] = { "3p-barcode -", "3p-barcode +" }; | |
const char* x_5pbc[2] = { "5p-barcode -", "5p-barcode +" }; | |
const char* x_5psi[2] = { "3p-sinsert -", "3p-sinsert +" }; | |
int dna[5] = { 'A', 'C', 'G', 'T', 'N' }; | |
#define MATRIX_SIZE 8192 | |
#define MAXQUALITY 256 | |
#define dim unsigned | |
struct qc_composition | |
{ dim A[MAXFIELDSIZE] | |
; dim C[MAXFIELDSIZE] | |
; dim G[MAXFIELDSIZE] | |
; dim T[MAXFIELDSIZE] | |
; dim N[MAXFIELDSIZE] | |
; | |
} ; | |
struct atabu | |
{ char* tabuseq | |
; unsigned tabulen | |
; struct match_requirement mr_tabu | |
; struct atabu* next | |
; | |
} ; | |
/* Initially only used for barcode funneling, but can also be used for | |
* other purposes. This bucket design with multiple purposes does not | |
* yet have a transparent design: the last bucket in struct param is | |
* the only non-barcode bucket, and is special. | |
* - barcodes_io_open only opens streams for the barcode buckets. | |
*/ | |
struct bucket | |
{ const char* barcode3p | |
; const char* sinsert3p | |
; const char* adaptor3p | |
; const char* barcode5p | |
; const char* sinsert5p | |
; const char* cat3p /* new */ | |
; int sinsert3p_n /* new */ | |
; int sinsert5p_n /* new */ | |
; const char* filetag /* use instead of barcode in file name */ | |
; struct atabu tabu2 /* fixme, linked list with first link not freeable, designtabu | |
* The '2' in tabu2 has no meaning -- artefact from rewrite, | |
* kept around because it makes searching easier. | |
* Too lazy to think of something better. | |
*/ | |
; int barcode_n | |
; int barcode_start | |
; int barcode_end | |
; int cache_lint_offset | |
; int cache_score | |
; int cache_accept | |
; char* fname | |
; struct qc_composition qc_cpsin /* input read composition */ | |
; struct qc_composition qc_cpsout /* clean read composition */ | |
; unsigned global_in_max_size | |
; unsigned global_out_max_size | |
; dim qc_lenout[MAXFIELDSIZE+1] /* clean read length */ | |
; dim * qc_bcq /* base call quality, one block is one read offet | |
* read-offset 0: q0, q1, q2 ..... q255 | |
* read-offset 1: q0, q1, q2 ..... q255 | |
* ..... | |
* read-offset N-1: q0, q1, q2 ..... q255 | |
*/ | |
; ZFILE fp | |
; | |
} ; | |
struct match_account | |
{ unsigned match_type /* n(one) g(lobal) p(refix), t(ail) */ | |
; double suffix_complexity /* for prefix match */ | |
; unsigned n_perfect /* head to tail length */ | |
; int nnn_offset | |
; int bbb_offset | |
; int aaa_offset | |
; int qqq_offset | |
; int dust_offset | |
; | |
} ; | |
struct param | |
{ unsigned cleanlength | |
#define VB_ALIGN 1 << 0 | |
#define VB_LINT 1 << 1 | |
#define VB_CLEAN 1 << 2 | |
; unsigned verbose | |
#define MODE_QC_EARLY 1 << 0 | |
#define MODE_QC_LATE 1 << 1 | |
#define MODE_QC_FULL 1 << 3 | |
#define MODE_FULLLENGTH 1 << 4 | |
#define MODE_KEEPALL 1 << 5 | |
#define MODE_DUSTPOLYA 1 << 6 | |
#define MODE_STREAMCLEAN 1 << 7 | |
; unsigned modes | |
; unsigned sample_lint | |
; unsigned sample_clean | |
/* we kludge/fudge this a bit. buck[buck_n] will also be utilised | |
* contrary to common conventions. It's an 'overflow' bucket | |
* for all reads that could not be classified in the barcode cases, | |
* or just all reads in the case where no barcode is present. | |
*/ | |
; struct bucket *buck | |
; int buck_n /* will be -1 initalised, so that loops 0 <= buck_n work */ | |
; unsigned p_3p_tth /* prefix match, minimal tail to head perfect match */ | |
; const char* fnmeta | |
; ZFILE fpclean | |
; ZFILE fplint | |
; int zippit | |
; int polya /* strip poly-As of at least this size */ | |
; unsigned taildust /* strip tail if dust-like criterion exceeds this */ | |
; unsigned taildustlate /* after adapter matching, strip tail if dust-like criterion exceeds this */ | |
; unsigned taildust_bases | |
; unsigned taildustlate_bases | |
; int trintco /* if != 0, remove reads with tri-nt score >= trintco */ | |
; int length_co | |
; int q_winlen | |
; int q_winofs | |
; int q_readofs /* skip first readofs bases */ | |
; int q_cutoff | |
; const char* adaptor_3p | |
; const char* sinsert_5p | |
; const char* adaptor_test /* experimental */ | |
; struct atabu the_tabu2 | |
; const char* basename | |
; struct sw_param swp_3p | |
; struct sw_param swp_5p | |
; struct sw_param swp_tabu | |
; struct sw_param swp_raptor /* reaper - adaptor testing */ | |
; struct match_requirement mr_3p_global /* used without clause */ | |
; struct match_requirement mr_3p_prefix /* only used if mr_minlen > 0 */ | |
; struct match_requirement mr_3p_barcode /* used without clause */ | |
; struct match_requirement mr_5p_require /* used without clause */ | |
; struct match_requirement mr_tabu /* used without clause */ | |
; struct match_requirement mr_5p_barcode | |
; double suffix_complexity_max /* in conjunction with prefix match */ | |
; double loco_check | |
; unsigned nnn_count | |
; unsigned nnn_outof | |
; int anchor_offset | |
; int length_limit | |
; const char* anchor_string | |
; unsigned kmer /* to analyse distance to adapter */ | |
; const char* format_clean | |
; const char* format_lint | |
; struct table metadata | |
; | |
} ; | |
void param_init | |
( struct param * pam | |
) /* 1. minlen | |
| 2. maxedit | |
| | 3. maxgap | |
| | | 4. offset | |
| | | | */ | |
{ struct match_requirement mr_3p_global = { 14, 2, 1, 0 } | |
; struct match_requirement mr_3p_prefix = { 8, 2, 0, 2 } | |
; struct match_requirement mr_3p_barcode= { 0, 6, 1, 0 } | |
; struct match_requirement mr_5p_require= { 0, 2, 1, 10} | |
; struct match_requirement mr_tabu = { 16, 2, 1, 0 } | |
; struct match_requirement mr_5p_barcode= { 0, 0, 0, 2 } /* bit field 1 = zip 2 = normal alignment */ | |
; struct atabu theboo = { "", 0, { 0 }, NULL } | |
; pam->cleanlength = 0 | |
; pam->verbose = 0 | |
; pam->modes = MODE_QC_FULL | |
; pam->sample_clean = 1 | |
; pam->sample_lint = 1 | |
; pam->polya = 0 | |
; pam->taildust = 0 | |
; pam->taildustlate = 0 | |
; pam->taildust_bases = 0 | |
; pam->taildustlate_bases = 0 | |
; pam->p_3p_tth = 0 /* strip perfect (read) tail to head (3p adaptor) match of any length >= 1 */ | |
; pam->mr_3p_global = mr_3p_global | |
; pam->mr_3p_prefix = mr_3p_prefix | |
; pam->mr_3p_barcode = mr_3p_barcode | |
; pam->mr_5p_require = mr_5p_require | |
; pam->mr_tabu = mr_tabu | |
; pam->mr_5p_barcode = mr_5p_barcode | |
; pam->buck = NULL | |
; pam->buck_n = -1 | |
; pam->fnmeta = NULL | |
; pam->adaptor_3p = NULL | |
; pam->adaptor_test = NULL | |
; pam->sinsert_5p = NULL | |
; pam->basename = "out" | |
; pam->trintco = 0 | |
; pam->length_co = 0 | |
; pam->q_winlen = 0 | |
; pam->q_winofs = 0 | |
; pam->q_readofs = 10 | |
; pam->q_cutoff = 0 | |
; pam->zippit = | |
#if WE_USE_ZLIB | |
1 | |
#else | |
0 | |
#endif | |
; pam->swp_3p.cost_gapleft = 3 | |
; pam->swp_3p.cost_gapright = 3 | |
; pam->swp_3p.cost_subst = 1 | |
; pam->swp_3p.gain_match = 4 | |
; pam->swp_3p.left_skip = 0 | |
; pam->swp_3p.right_limit = 0 | |
; pam->swp_3p.flags = 0 | |
; pam->swp_5p = pam->swp_3p | |
; pam->swp_tabu = pam->swp_3p | |
; pam->swp_raptor = pam->swp_3p | |
; pam->swp_raptor.cost_gapleft = 1 | |
; pam->swp_raptor.cost_gapright = 1 | |
; pam->swp_raptor.cost_subst = 1 | |
; pam->swp_raptor.gain_match = 2 | |
; pam->the_tabu2 = theboo | |
; pam->nnn_count = 0 | |
; pam->nnn_outof = 0 | |
; pam->anchor_offset = -1 | |
; pam->anchor_string = NULL | |
; pam->kmer = 10 | |
; pam->suffix_complexity_max = 0.25 | |
; pam->loco_check = 0.0 | |
; pam->format_clean = "@%I%n%C%n+%n%Q%n" | |
; pam->format_lint = "@%I%tmsg=%M%n%R%n+%n%Y%n" | |
; pam->length_limit = 0 | |
; pam->fpclean = NULL | |
; pam->fplint = NULL | |
; memset(&pam->metadata, 0, sizeof pam->metadata) | |
; } | |
unsigned get_mdta /* minimum distance to adapter, per k-mer varied along read */ | |
( char* buf_distance | |
, char* buf_offset | |
, const char* seq | |
, unsigned seq_n | |
, struct param* pam | |
) | |
{ const char* theaptr = pam->adaptor_test | |
; unsigned k = pam->kmer | |
; struct sw_param* swp = &pam->swp_raptor | |
; static unsigned char* mdta = NULL | |
; unsigned mask = 0 | |
; unsigned kmer = 0 | |
; char seq2[MAXFIELDSIZE+1] | |
; int i = 0 | |
; if (!mdta) | |
mdta = calloc(1 << 2*k, 1) | |
; buf_distance[0] = '\0' | |
; buf_offset[0] = '\0' | |
; strncpy(seq2, seq, MAXFIELDSIZE) | |
; if (!theaptr) | |
theaptr = pam->adaptor_3p | |
; if (!theaptr) /* give up */ | |
seq_n = 0 | |
; for (i=0; i < seq_n; i++) | |
{ int dist = 7, offset = 31 | |
; unsigned b = themap[(unsigned char) seq2[i]] | |
; mask = (mask << 1) & ((1 << k) -1) | |
; kmer = (kmer << 2) & ((1 << 2*k) -1) | |
; if (b > 3) | |
mask |= 1 | |
; else | |
kmer |= b | |
;if(0)fprintf(stderr, "[%s]\n", seq2+i) | |
; if (i+1 < k) | |
{ buf_distance[i] = '.' | |
; buf_distance[seq_n+i+1] = '.' | |
; continue | |
; } | |
/* store distance in 3 bits (0..7), offset in 5 bits (1..31) */ | |
if (mask || !mdta[kmer]) | |
{ SWNUM data[MATRIX_SIZE] = { 0 } | |
; struct sw_alninfo ai = { 0 } | |
; char bu = seq2[i+1] | |
; seq2[i+1] = '\0' /* sw_fill2 wants 0-terminated. blah. */ | |
; if | |
( !sw_fill2 | |
( &ai | |
, data | |
, MATRIX_SIZE | |
, seq2 + i+1-k | |
, theaptr | |
, swp->cost_gapleft + swp->cost_gapright | |
, swp | |
) ) | |
{ sw_trace3(&ai, swp, ai.max_ij, 0) /* do_barcode_3p */ | |
; dist = k - ai.n_match + ai.n_insr + ai.n_insl | |
; offset = ai.rgt_start | |
; } | |
if (dist > 7) dist = 7 | |
; if (offset > 31) offset = 31 | |
; if (!mask) | |
mdta[kmer] = dist | (offset << 3) | |
; seq2[i+1] = bu | |
; } | |
else | |
dist = mdta[kmer] & 7 | |
, offset = mdta[kmer] >> 3 | |
; buf_distance[i] = '0' + (dist > 9 ? 9 : dist) | |
; buf_distance[seq_n+i+1] = offset["0123456789ABCDEFGHIJKLMNOPQRSTUV"] | |
; } | |
; if (seq_n) | |
{ buf_distance[i] = '\n' | |
; buf_distance[2 * seq_n+1] = '\0' | |
; buf_distance[0] = 'D' | |
; buf_distance[seq_n+1] = 'O' | |
; } | |
else | |
{ memcpy(buf_distance, "D\nO\0", 4) | |
; return 3 | |
; } | |
return 2 * seq_n + 1 | |
; } | |
struct read_stats_global | |
{ unsigned N | |
; unsigned N_err /* globally set */ | |
; unsigned N_truncated /* */ | |
; unsigned N_alien /* not ACGTUNX */ | |
; unsigned N_cflr /* conflict resolved, per case */ | |
; unsigned N_clean /* per case */ | |
; unsigned N_bb /* globally set */ | |
; unsigned N_qq /* globally set */ | |
; unsigned N_anchor /* globally set */ | |
; unsigned N_loco /* globally set */ | |
; unsigned N_nomsg /* globally set */ | |
; unsigned N_cfl /* per case */ | |
; unsigned N_nn /* per case */ | |
; unsigned N_aa /* per case */ | |
; unsigned N_tri /* per case */ | |
; unsigned N_nomatch /* per case */ | |
; unsigned N_lenlo /* per case */ | |
; unsigned N_tabu /* per case */ | |
; unsigned N_5p_nosinsert /* per case */ /* This group should add up to */ | |
; unsigned N_lint /* per case */ /* to this */ | |
; unsigned long B /* */ | |
; unsigned long N_cells /* total size of alignment matrices */ | |
; unsigned B_BB /* bases lost due to BBB, globally set */ | |
; unsigned B_tri /* bases lost due to tri-tail trimming, globally set */ | |
; unsigned B_NN /* bases lost due to NNN, globally set */ | |
; unsigned B_AA /* bases lost due to AAA, (after adaptor stripping) */ | |
; unsigned B_QQ /* bases lost due to QQQ, (after adaptor stripping) */ | |
; unsigned B_lco /* bases lost due to user specified length cutoff */ | |
/* hierverder; implement below */ | |
; unsigned B_3pA /* bases lost due to 3pA stripping */ | |
; unsigned B_5pMM /* mismatches in required 5p adaptor */ | |
; | |
} ; | |
int cpytofield | |
( char* dest | |
, char* src | |
, int n | |
) | |
{ if (n > MAXFIELDSIZE) | |
n = MAXFIELDSIZE | |
, g_fieldtosmall++ | |
; memcpy(dest, src, n) | |
; dest[n] = '\0' | |
; return n | |
; } | |
struct record | |
{ char id [MAXFIELDSIZE+1] | |
; char seq [MAXFIELDSIZE+1] | |
; char q [MAXFIELDSIZE+1] | |
; char discard [MAXFIELDSIZE+1] | |
; char annot [MAXFIELDSIZE+1] | |
; unsigned id_n | |
; unsigned seq_n | |
; unsigned q_n | |
; unsigned discard_n | |
; unsigned annot_n | |
; int clean_n | |
; unsigned count | |
; const char* out_message | |
; struct read_stats_global rs /* NOTE maintains state between different read_record2 calls */ | |
; | |
} ; | |
void validate_sequence | |
( struct record* rec | |
) | |
{ int i | |
; for (i=0;i<rec->seq_n;i++) | |
{ rec->seq[i] = toupper((unsigned char) rec->seq[i]) | |
; if (BASEMAP((unsigned char) rec->seq[i]) > 4) | |
rec->rs.N_alien++ | |
, rec->seq[i] = 'N' | |
; } | |
} | |
#define RR_ERROR 3 | |
#define RR_NOMEM 2 | |
#define RR_DONE 1 | |
#define RR_OK 0 | |
/* At the moment this keeps reading in case of RL_NOMEM, | |
* but readline does not flush overly long lines yet, | |
* so there is not yet a mechanism to just ignore | |
* a record with lines that are too long. | |
* todo fixme | |
*/ | |
int read_record2 | |
( ZFILE ip | |
, const char* format | |
, struct record* rec | |
, int length_limit | |
) | |
{ unsigned char* a = (unsigned char*) format | |
; unsigned rlstat = 0 | |
; int q_received = 0 | |
; unsigned truncated = 0 | |
; int maxfieldsize = MAXFIELDSIZE | |
; if (length_limit && length_limit < MAXFIELDSIZE) | |
maxfieldsize = length_limit | |
; rec->id_n = 0 | |
; rec->seq_n = 0 | |
; rec->annot_n = 0 | |
; rec->discard_n = 0 | |
; rec->clean_n = 0 | |
; rec->q_n = 0 | |
; rec->count = 1 | |
; rec->id[0] = '\0' | |
; rec->seq[0] = '\0' | |
; rec->discard[0]= '\0' | |
; rec->q[0] = '\0' | |
; rec->annot[0] = '\0' | |
; while (a[0]) | |
{ char* receive = NULL | |
; unsigned *np_receive = 0 | |
; switch(a[0]) | |
{ case 'I': receive = rec->id; np_receive = &rec->id_n; break | |
; case 'R': receive = rec->seq; np_receive = &rec->seq_n; break | |
; case 'D': receive = rec->discard; np_receive = &rec->discard_n; break | |
; case 'A': receive = rec->annot; np_receive = &rec->annot_n; break | |
; case 'Q': receive = rec->q; np_receive = &rec->q_n; q_received = 0; break | |
; } | |
rlstat |= kraken_readline(ip, receive, maxfieldsize, np_receive, &truncated) | |
; if (a[0] == 'R') | |
rec->rs.N_truncated += truncated | |
; if (rlstat & (RL_DONE | RL_ERROR)) | |
break | |
; a++ | |
; } | |
if (rlstat & (RL_DONE | RL_ERROR)) | |
{ if (a != (unsigned char*) format && a[0]) | |
argh("reaper", "last record was incomplete") | |
; return RR_DONE | |
; } | |
if (rlstat & RL_NOMEM) /* fixme not checked yet */ | |
return RR_NOMEM | |
; rec->rs.N++ | |
; validate_sequence(rec) | |
; if (q_received && rec->q_n != rec->seq_n) | |
{ rec->rs.N_err++ | |
; rec->q_n = 0 | |
; rec->q[0] = '\0' | |
; } | |
return RR_OK | |
; } | |
/* | |
; case 'I': receive = rec->id; np_receive = &rec->id_n; break | |
; case 'R': receive = rec->seq; np_receive = &rec->seq_n; break | |
; case 'D': receive = rec->discard; np_receive = &rec->discard_n; break | |
; case 'Q': receive = rec->q; np_receive = &rec->q_n; q_received = 0; break | |
*/ | |
#define izblank(c) ((unsigned char) (c) == ' ' || (unsigned char) (c) == '\t') | |
int read_recordx | |
( ZFILE ip | |
, const char* format | |
, struct record* rec | |
) | |
{ const char* fmtp = format, *fmtz = format + strlen(format) | |
; unsigned rlstat = 0 | |
#define THE_BUFSIZE 8191 | |
; char buf[THE_BUFSIZE+1] | |
; unsigned n_received = 0, n_lines = 0 | |
; unsigned truncated = 0 | |
; char* bufp = NULL, *curp = NULL, *bufz = NULL | |
; int esc = 0 | |
; rec->id_n = 0 | |
; rec->seq_n = 0 | |
; rec->discard_n = 0 | |
; rec->q_n = 0 | |
; rec->clean_n = 0 | |
; rec->count = 1 | |
; rec->id[0] = '\0' | |
; rec->seq[0] = '\0' | |
; rec->discard[0]= '\0' | |
; rec->q[0] = '\0' | |
; while (fmtp < fmtz) | |
{ if (!bufp) | |
{ rlstat |= kraken_readline(ip, buf, THE_BUFSIZE, &n_received, &truncated) | |
; if (truncated || rlstat & (RL_DONE | RL_ERROR)) | |
break | |
; bufp = buf | |
; bufz = buf + n_received | |
; n_lines++ | |
; esc = 0 | |
; } | |
curp = bufp | |
; if (esc) | |
{ switch((unsigned char) fmtp[0]) | |
{ case '#': | |
fmtp++ | |
; bufp = NULL | |
; continue | |
; | |
case 'n': | |
if (bufp != bufz) | |
goto DONE | |
; fmtp++ | |
; bufp = NULL | |
; continue | |
; | |
case '%': | |
if (bufp[0] != '%') goto DONE | |
; bufp++ | |
; break | |
; | |
case 't': | |
if (bufp[0] != '\t') goto DONE | |
; bufp++ | |
; break | |
; | |
case 's': | |
if (bufp[0] != ' ') goto DONE | |
; bufp++ | |
; break | |
; | |
case '.': | |
bufp++ | |
; break | |
; | |
case 'b': | |
while (bufp < bufz && izblank(bufp[0])) | |
bufp++ | |
; break | |
; | |
case 'C': case 'X': | |
{ int n_scanned = 0 | |
; if (sscanf(bufp, "%u%n", &rec->count, &n_scanned) < 1) | |
goto DONE | |
; bufp += n_scanned | |
; } | |
break | |
; | |
case 'G': | |
while (bufp < bufz && !izblank((unsigned char) bufp[0])) | |
bufp++ | |
; rec->discard_n = cpytofield(rec->discard, curp, bufp -curp) | |
; break | |
; | |
case 'F': | |
while (bufp < bufz && '\t' != (unsigned char) bufp[0]) | |
bufp++ | |
; rec->discard_n = cpytofield(rec->discard, curp, bufp -curp) | |
; break | |
; | |
case 'H': | |
while (bufp < bufz && (unsigned char) bufp[0] != (unsigned char) fmtp[1]) | |
bufp++ | |
; if (bufp == bufz) | |
goto DONE | |
; rec->discard_n = cpytofield(rec->discard, curp, bufp -curp) | |
; fmtp++ | |
; bufp++ | |
; break | |
; | |
case 'a': | |
rec->annot_n = cpytofield(rec->annot, rec->discard, rec->discard_n) | |
; break | |
; | |
case 'A': | |
while (izblank((unsigned char) curp[0])) | |
curp++ | |
; bufp = curp | |
; while (bufp < bufz) | |
bufp++ | |
; rec->annot_n = cpytofield(rec->annot, curp, bufp -curp) | |
; break | |
; | |
case 'I': | |
while (bufp < bufz && !izblank((unsigned char) bufp[0])) | |
bufp++ | |
; rec->id_n = cpytofield(rec->id, curp, bufp -curp) | |
; break | |
; | |
case 'Q': | |
while (bufp < bufz && !izblank((unsigned char) bufp[0])) | |
bufp++ | |
; rec->q_n = cpytofield(rec->q, curp, bufp -curp) | |
; break | |
; | |
case 'R': | |
while (bufp < bufz && isalpha((unsigned char) bufp[0])) | |
bufp++ | |
; rec->seq_n = cpytofield(rec->seq, curp, bufp -curp) | |
; validate_sequence(rec) | |
; break | |
; | |
default: | |
goto DONE | |
; } | |
esc = 0 | |
; } | |
else | |
{ if (fmtp[0] == '%') | |
esc = 1 | |
; else if (bufp[0] != fmtp[0]) | |
goto DONE | |
; else | |
bufp++ | |
; } | |
fmtp++ | |
; } | |
DONE : | |
if (truncated) | |
argh("reaper", "line too long") | |
, rlstat |= RL_ERROR | |
; if (n_lines && (fmtp[0] || (bufp && bufp - buf != n_received))) | |
{ argh("reaper", "parse error (remaining format string %s, buffer [%s])", fmtp, bufp) | |
; return RR_ERROR | |
; } | |
if (rlstat & RL_DONE) | |
return RR_DONE | |
; if (rlstat & RL_ERROR) | |
return RR_ERROR | |
; if (rlstat & RL_NOMEM) /* fixme not checked yet */ | |
return RR_NOMEM | |
; rec->rs.N++ | |
; return RR_OK | |
; } | |
/* | |
/stringoffsets/ | |
========================================================= | |
alnspace | |
alignment space: | |
0 1 2 3 4 5 6 7 8 9 10 coordinates | |
G G G A T C C T A G read | |
C T A G G C C adaptor | |
left_start : 7 | |
lint_offset : left_start - 1 == 6 == clean_n | |
========================================================= | |
string space: | |
0 1 2 3 4 5 6 7 8 9 10 coordinates | |
G G G A N C N T N G read | |
^ N offset | |
N offset is 4, we return 5, so that | |
lint_offset == NNN_offset - 1 == 4 == clean_n | |
========================================================= */ | |
int get_nnn_offset | |
( const char* s | |
, unsigned n | |
, unsigned count | |
, unsigned outof | |
) | |
{ unsigned hist[SW_ALN_MAXSIZE] = { 0 } | |
; if (!outof || count > outof) | |
return -1 | |
; int i, nn = 0 | |
; for (i=0;i<n;i++) | |
{ unsigned char a = s[i] | |
; unsigned char e = hist[i % outof] | |
; hist[i % outof] = a | |
; nn += (a == 'N') - (e == 'N') | |
; if (nn >= count) | |
{ while (nn > 0) /* now check which ones are not N */ | |
{ if (hist[i-- % outof] == 'N') | |
nn-- | |
; } | |
return i+2 /* noteme, alignment space coordinate | |
* i+1 is the string-space NNNN offset, | |
* but we need to return offset in aligment space (see above) | |
*/ | |
; } | |
} | |
return -1 | |
; } | |
/* same return semantics as get_nnn_offset */ | |
void get_dust_info | |
( const char* s | |
, unsigned n | |
, int* maxp /* dust score for tail */ | |
, int* maxidp /* offset where this is attained */ | |
, int* dominant_base /* dominant base */ | |
, int* dominant_base_count /* and its count */ | |
) | |
{ int max = 0, maxid =0, i | |
; unsigned count[6] = { 0 } | |
; dustscore_tail(s, n, NULL, &max, &maxid) | |
; maxp[0] = max | |
; maxidp[0] = maxid | |
; dominant_base[0] = 4 | |
; dominant_base_count[0] = 0 | |
; for (i=maxid; i<n; i++) | |
count[themap[(unsigned char) s[i]]]++ | |
; for (i=0;i<4;i++) | |
if (count[i] > dominant_base_count[0]) | |
dominant_base[0] = i | |
, dominant_base_count[0] = count[i] | |
; } | |
/* same return semantics as get_nnn_offset */ | |
int get_dust_offset | |
( const char* s | |
, unsigned n | |
, unsigned threshold | |
, unsigned bases | |
) | |
{ int max = 0, maxid =0, found = 0 | |
; dustscore_tail(s, n, NULL, &max, &maxid) | |
; found = threshold && max >= threshold && maxid+6 <= n | |
; if (found && bases) | |
{ int i | |
; unsigned count[6] = { 0 } | |
; for (i=maxid; i<n; i++) /* fixme: i=0 should that be i=maxid ? */ | |
count[themap[(unsigned char) s[i]]]++ | |
; for (i=0;i<4;i++) | |
{ if (count[i] * 2 > (n - maxid) && (bases & (1 << i))) | |
break | |
; } | |
found = i < 4 | |
; } | |
return found ? (maxid+1) : -1 | |
; } | |
/* same return semantics as get_nnn_offset */ | |
int get_polya_offset | |
( const char* s | |
, unsigned n | |
, int minlen | |
) | |
{ int i = n /* Note: should work for q_n == 0 */ | |
; int polya = 0 | |
; while (--i >= 0 && s[i] == 'A') | |
polya++ | |
; return minlen && polya >= minlen ? n + 1 - polya : -1 | |
; } | |
int cmp_int | |
( const void* a | |
, const void* b | |
) | |
{ return ((int*)a)[0] - ((int*)b)[0] | |
; } | |
int get_qqq_offset | |
( struct record* rec | |
, int cut /* cut when median drops below this */ | |
, unsigned winlen | |
, unsigned winofs | |
, unsigned readofs | |
) | |
{ unsigned char hist[256] = { 0 } | |
; int window[255] /* signed so that we can use a-b for comparison */ | |
; unsigned winmid | |
; int i = 0, n_lt = 0, n_gt = 0 | |
; int median = 0 | |
; int lastqual = 0 | |
; if (!winlen || readofs + winlen > rec->q_n) | |
return -1 | |
; lastqual = rec->q[rec->q_n-1] | |
; winmid = (winlen+1) / 2 /* e.g. 5 -> 3; n_smaller + n_equal must be >= winmid | |
* and also n_larger + n_equal must be >= winmid | |
*/ | |
; for (i=readofs;i<winlen+readofs; i++) | |
window[i % winlen] = rec->q[i] | |
; qsort(window, winlen, sizeof window[0], cmp_int) | |
; median = window[winmid-1] | |
; for (i=readofs;i<winlen+readofs; i++) | |
{ unsigned i_out = i % winlen | |
; window[i_out] = rec->q[i] | |
; hist[(unsigned char) window[i_out]]++ | |
; n_lt += window[i_out] < median | |
; n_gt += window[i_out] > median | |
; } | |
i = readofs + winlen - 1 /* necessary for median >= cut logic */ | |
; while (median >= cut && ++i < rec->q_n + winlen) | |
{ unsigned i_out = i % winlen | |
; unsigned rgt = i < rec->q_n ? rec->q[i] : lastqual /* shift this value in */ | |
; unsigned lft = window[i_out] /* shift this value out */ | |
; window[i_out] = rgt | |
; hist[rgt]++ | |
; hist[lft]-- | |
; if (rgt < median) n_lt++ | |
; if (rgt > median) n_gt++ | |
; if (lft < median) n_lt-- | |
; if (lft > median) n_gt-- | |
; while (!hist[median] || n_lt + hist[median] < winmid) | |
{ n_lt += hist[median] | |
; n_gt -= hist[median+1] | |
; median++ | |
; } | |
while (!hist[median] || n_gt + hist[median] < winmid) | |
{ n_gt += hist[median] | |
; n_lt -= hist[median-1] | |
; median-- | |
; } | |
;if(0)fprintf(stderr, " %2d-%2d-%d", ji median, ji rgt, ji i); | |
} | |
;if(0)fputc('\n', stderr); | |
{ int j = (i+1+winofs)-winlen | |
; if (j > rec->q_n) | |
j = rec->q_n | |
; if (i < (rec->q_n + winlen)) | |
return j | |
; } | |
return -1 | |
; } | |
int get_bbb_offset | |
( struct record* rec | |
) | |
{ int i = rec->q_n /* Note: should work for q_n == 0 */ | |
; while (--i >= 0) | |
{ if (rec->q[i] != 'B') | |
break | |
; } | |
return i+1 == rec->q_n ? -1 : i+2 /* same return semantics as get_nnn_offset */ | |
; } | |
double string_complexity | |
( const char* a | |
, unsigned n | |
) | |
{ int i, n_pairs = 0 | |
; if (!n) | |
return 1.0 | |
; for (i=0;i+1<n;i++) | |
n_pairs += a[i] == a[i+1] | |
; return (1.0 - n_pairs * 1.0 / n) | |
; } | |
#if 0 | |
/* fixme: n_stretch in callers often computed using SW_ALN_MAXSIZE, but trimming changes this */ | |
/* returns number of matches if parameters satisfied, 0 otherwise */ | |
int alignment_edit_ok | |
( const unsigned char* aln | |
, unsigned aln_length | |
, unsigned n_stretch | |
, unsigned n_edit_max | |
, unsigned n_gap_max | |
) | |
{ int i, n_edit = 0, n_gap = 0, n_subst = 0, n_match = 0, ok = 0 | |
; if (!n_stretch) | |
return 0 | |
; unsigned hist[SW_ALN_MAXSIZE] = { 0 } | |
;if(0)fprintf(stderr, "length %d n_stretch %d e %d\n", (int) aln_length , (int) n_stretch , (int) n_edit_max) | |
; for (i=0;i<aln_length;i++) | |
{ unsigned char a = aln[i] | |
; unsigned char e = hist[i % n_stretch] | |
; hist[i % n_stretch] = a | |
; n_subst += (a == 'x') - (e == 'x') | |
; n_gap += (a == '-') - (e == '-') | |
, n_edit = n_subst + n_gap | |
; n_match += a == '|' | |
; if ( n_edit <= n_edit_max | |
&& i+1 + n_edit_max >= n_stretch + n_edit | |
&& n_gap <= n_gap_max | |
) | |
ok = 1 | |
; } | |
return ok ? n_match : 0 | |
; } | |
#endif | |
/* the prefix match is complicated. | |
* We must be able to find a good match at the end | |
* that's not beaten by a longer shoddy match at start | |
* for which the suffix complexity criterion fails. This | |
* probably, ideally, requires using the last-row max_ej cell. | |
* The max_ej cell will not, however, find prefix matches | |
* followed by low complexity reads. | |
*/ | |
int sw_match_end | |
( struct sw_alninfo* ai | |
, struct match_account* ma | |
, struct sw_param* swp | |
, unsigned prefix_perfect_minlen | |
, struct match_requirement* mr_prefix | |
) | |
{ unsigned n_gap = ai->n_insl + ai->n_insr | |
; unsigned n_edit = n_gap + ai->n_subst | |
; unsigned prefix_length = ai->rgt_end + 1 - ai->rgt_start | |
; ma->suffix_complexity = 0.0 | |
/* This code uses the globally highest scoring alignment, | |
* and applies the prefix match requirement to the entire | |
* alignment, not just a subpart. | |
* This is so that a case such as | |
GATAGATCGAAACCATCCTCTGCTAGTTTTC-GTATG : [16,36] | |
||x|x|||-x||x|||-|xx|| : score 56 | |
TCGTATGC-CGTCTTCTGCTTGT : [1,21] 0/0 recno=1 | |
* is not considered a match. | |
*/ | |
; unsigned prefix_match | |
= ( (!mr_prefix->mr_offset || (ai->rgt_start <= mr_prefix->mr_offset)) | |
&& prefix_length >= mr_prefix->mr_minlen | |
&& n_edit <= mr_prefix->mr_maxedit | |
&& n_gap <= mr_prefix->mr_maxgap | |
) | |
/* now we just inspect the end. */ | |
; if (!prefix_match && prefix_perfect_minlen) | |
{ int ofs = (ai->ni-1) * ai->nj /* this sits on the left column of zeros, last entry */ | |
; unsigned j, bestj = 0 | |
; for (j=prefix_perfect_minlen;j<ai->nj;j++) | |
{ if (ai->data[ofs+j] == j * swp->gain_match) | |
bestj = j | |
; } | |
ma->n_perfect = bestj | |
; return bestj ? 1 : 0 | |
; } | |
/* NOTE. The prefix match may fall just short of the read, | |
* for example miss the last one or two bases. The suffix | |
* complexity will still be small, as a result of | |
* prefix_length (the matching part of the adaptor) | |
* being part of the formula | |
*/ | |
if (prefix_match) | |
{ if (ai->lft_end + 1 < ai->ni) | |
{ double left_trail = ai->ni - ai->lft_end - 1 | |
; double c = string_complexity(ai->left+ai->lft_end+1, ai->ni - ai->lft_end-1) | |
; ma->suffix_complexity = c * (left_trail) / (1.0 * prefix_length + left_trail) | |
; } | |
return 1 | |
; } | |
return 0 | |
; } | |
/* barcode code */ | |
int sw_match_bc3p | |
( struct sw_alninfo* ai | |
, struct match_requirement* mr_global | |
, struct match_requirement* mr_prefix | |
, struct match_requirement* mr_barcode | |
, int sinsert_n /* number of bases before the barcode, not yet used. */ | |
, int barcode_n /* length of barcode, not yet used. */ | |
, int* n_match | |
) | |
{ int n_length = mr_global->mr_minlen | |
; int n_edit_max = mr_global->mr_maxedit | |
; int n_gap_max = mr_global->mr_maxgap | |
; n_match[0] = 0 | |
; | |
/* /barcodehardcoded/ | |
* note: 3p-sinsert+barcode start hardcoded to be in pos 1 or 2 | |
* (beware of alignment space, which uses offset 1) | |
* Now we have to be careful wrt to the barcode match requirements; | |
* How do we locate the right spot in the alignment? | |
*/ | |
if (ai->rgt_start != 1 && ai->rgt_start != 2) | |
{ if(0)fprintf(stderr, "bail-out 2 (%d)\n", (int) ai->rgt_start) | |
; return 0 | |
; } | |
/* reduce global range alignment length parameter if match is at end of range */ | |
if (ai->ni - ai->lft_start < n_length && mr_prefix->mr_minlen) | |
{ n_length = ai->ni - ai->lft_start | |
; if (n_length < mr_prefix->mr_minlen) | |
return 0 | |
; n_edit_max = mr_prefix->mr_maxedit | |
; n_gap_max = mr_prefix->mr_maxgap | |
; } | |
return | |
( ( !mr_barcode->mr_minlen /* check barcode range alignment if specied */ | |
|| sw_alignment_edit_ok | |
( ai->aln_aln + ai->aln_ofs | |
/* fixme, better control over barcode */ | |
, mr_barcode->mr_minlen /* fixed length search: can this overrun the alignment? document! */ | |
, mr_barcode->mr_minlen | |
, mr_barcode->mr_maxedit | |
, mr_barcode->mr_maxgap | |
) | |
) | |
&& | |
/* second argument, alignment length: this is not | |
* strictly correct, but the remainder of the | |
* alignment is zero-filled. Note: our search | |
* space (right_limit in sw_fill2) has been | |
* restricted in do_barcode_3p. | |
*/ | |
#define WOTCHA 0 | |
/* check global range alignment */ | |
( n_match[0] | |
= sw_alignment_edit_ok | |
( ai->aln_aln + ai->aln_ofs | |
, ai->aln_end - ai->aln_ofs | |
, n_length | |
, n_edit_max | |
, n_gap_max | |
) ) | |
) | |
; } | |
/* aln_ofs == SW_ALN_MAXSIZE + 1 */ | |
int sw_match_any | |
( struct sw_alninfo* ai | |
, struct match_requirement* mr | |
) | |
{ | |
return | |
( !mr->mr_offset | |
|| (mr->mr_offset > 0 && ai->rgt_start <= mr->mr_offset) | |
|| (mr->mr_offset < 0 && (ai->nj - ai->rgt_end) <= -mr->mr_offset) | |
) | |
&& sw_alignment_edit_ok | |
( ai->aln_aln + ai->aln_ofs | |
, ai->aln_end - ai->aln_ofs | |
, mr->mr_minlen | |
, mr->mr_maxedit | |
, mr->mr_maxgap | |
) | |
; } | |
int sw_printaccount | |
( const struct match_account* ma | |
, char* buf | |
, unsigned bufsize | |
) | |
{ return | |
snprintf | |
( buf | |
, bufsize | |
, "mt=%c,sc=%.2f,ht=%d,nn=%d,bb=%d,aa=%d,qq=%d" | |
, (int) ma->match_type | |
, ma->suffix_complexity | |
, (int) ma->n_perfect | |
, (int) ma->nnn_offset | |
, (int) ma->bbb_offset | |
, (int) ma->aaa_offset | |
, (int) ma->qqq_offset | |
) | |
; } | |
int sw_printaln3 | |
( const struct sw_alninfo* ai | |
, char* buf | |
, unsigned bufsize | |
) | |
{ unsigned o = ai->aln_ofs | |
; return | |
snprintf | |
( buf | |
, bufsize | |
, "ls=%d,le=%d,rs=%d,re=%d,score=%d,left=%s,aln=%s,right=%s" | |
, (int) ai->lft_start | |
, (int) ai->lft_end | |
, (int) ai->rgt_start | |
, (int) ai->rgt_end | |
, (int) ai->data[ai->max_ij] | |
, ai->aln_lft+o | |
, ai->aln_aln+o | |
, ai->aln_rgt+o | |
) | |
; } | |
void dump_read | |
( int cleaned | |
, void* fpo | |
, const struct record* rec | |
, const struct sw_alninfo* ai | |
, const struct match_account* ma | |
, struct param* pam | |
) | |
#define THEBUFSIZE 1023 | |
{ char buf[THEBUFSIZE+1] | |
; char buf2[THEBUFSIZE+1] | |
#if WE_USE_ZLIB | |
; int zippit = pam->zippit | |
#endif | |
; const char* format = cleaned ? pam->format_clean : pam->format_lint | |
; const char* f = format | |
; const char* z = format + strlen(format) | |
; char dust [MAXFIELDSIZE+1] | |
; int escape = 0, tnt | |
; int dustidx = 0, dustscore = 0 | |
; while (f < z) | |
{ unsigned c = (unsigned char) f[0] | |
; const char* p = buf | |
; int n = 0 | |
; if (!escape) | |
{ if (c == '%') | |
escape = 1 | |
; else | |
buf[0] = c | |
, n = 1 | |
; } | |
else | |
{ int cleanlen = cleaned ? rec->clean_n : rec->seq_n | |
; switch(c) | |
{ case 'R': p = rec->seq; n = rec->seq_n; | |
break; | |
; case 'C': case 'E': p = rec->seq; n = rec->clean_n; | |
if (c == 'E' && n == 0) { p = "N"; n = 1; } | |
break; | |
; case 'V': revcompl(rec->seq, rec->clean_n, buf); p = buf; n = rec->clean_n; | |
break; | |
; case 'Z': { int themax = rec->seq_n; | |
if (pam->length_co > themax) themax = pam->length_co; | |
n = snprintf(buf, THEBUFSIZE+1, "%.*s", (int) rec->clean_n, rec->seq); | |
while (n < themax && n < THEBUFSIZE) | |
buf[n++] = 'N'; | |
buf[n] = '\0'; | |
break; | |
} | |
; case 'I': p = rec->id; n = rec->id_n; break; | |
; case 'L': n = snprintf(buf, THEBUFSIZE+1, "%d", (int) rec->clean_n); break; | |
; case 'X': n = snprintf(buf, THEBUFSIZE+1, "%u", (unsigned) rec->count); break; | |
; case 'Q': p = rec->q; n = cleanlen; break; | |
; case 'Y': p = rec->q; n = rec->q_n; break; | |
; case 'U': { int score, score_offset, base, base_count | |
; int seqlen = cleanlen | |
; get_dust_info(rec->seq, seqlen, &score, &score_offset, &base, &base_count) | |
; n = snprintf(buf, THEBUFSIZE+1, "score=%d,offset=%d,len=%d,base=%c,count=%d", | |
score, score_offset+1, seqlen - score_offset, dna[base], base_count) | |
; } | |
; break | |
; case 'T': tnt = trintscore(rec->seq, cleanlen) | |
; n = snprintf(buf, THEBUFSIZE+1, "%d", tnt) | |
; break | |
; case 'D': dustscore_tail(rec->seq, rec->clean_n, dust, &dustscore, &dustidx) | |
; p = dust; n = rec->clean_n; break | |
; case '_': if (!dustscore) | |
dustscore_tail(rec->seq, rec->seq_n, NULL, &dustscore, &dustidx) | |
; n = snprintf(buf, THEBUFSIZE+1, "%d:%d", dustidx+1, dustscore); | |
break | |
; case 'M': p = rec->out_message; n = strlen(p); break | |
; case 'i': case 'J': n = snprintf(buf, THEBUFSIZE+1, "%d", (int) rec->rs.N); break; | |
; case 'f': n = snprintf(buf, THEBUFSIZE+1, "%d", (int) (4 * (rec->rs.N-1) + 2)); break; | |
; case '3': if (ai) n = sw_printaln3(ai, buf, THEBUFSIZE+1); break | |
; case 'A': p = rec->annot; n = rec->annot_n; break | |
; case '?': n = get_mdta(buf, buf2, rec->seq, rec->seq_n, pam); break | |
; case '=': if (ma) n = sw_printaccount(ma, buf, THEBUFSIZE+1); break | |
; case 'n': | |
case 't': | |
case 's': | |
case '%': buf[0] = c == 'n' ? '\n' : c == 't' ? '\t' : c == 's' ? ' ' : '%' | |
; n = 1 | |
; break | |
; case 'q': { int a = (unsigned char) f[1]; | |
int themax = cleanlen; | |
if (pam->length_co > themax) themax = pam->length_co; | |
f++; | |
n = snprintf(buf, THEBUFSIZE+1, "%.*s", (int) rec->clean_n, rec->q); | |
while (n < themax && n < THEBUFSIZE) | |
buf[n++] = a; | |
buf[n] = '\0'; | |
break; | |
} | |
; } | |
escape = 0 | |
; } | |
if (n > 0) | |
{ | |
#if WE_USE_ZLIB | |
if (zippit) gzwrite(fpo, p, n) | |
; else fwrite(p, n, 1, fpo) | |
#else | |
fwrite(p, n, 1, fpo) | |
#endif | |
; } | |
f++ | |
; } | |
} | |
#undef THEBUFSIZE | |
/* low complexity */ | |
int loco | |
( struct record* rec | |
, struct param* pam | |
, double loco_check | |
) | |
{ if (string_complexity(rec->seq, rec->seq_n) < loco_check) | |
{ rec->out_message = "loco" | |
; dump_read(0, pam->fplint, rec, NULL, NULL, pam) | |
; return 1 | |
; } | |
return 0 | |
; } | |
/* hierverder segfault if buckid == buck_n or if maxfieldsize == MAXFIELDSIZE. | |
should be plenty, no? | |
*/ | |
void qc_tally_in | |
( struct record* rec | |
, struct param* pam | |
, int buckid | |
, int N | |
) | |
{ int j | |
; for (j=0;j<N;j++) | |
{ switch(rec->seq[j]) | |
{ case 'A': pam->buck[buckid].qc_cpsin.A[j]++; break | |
; case 'C': pam->buck[buckid].qc_cpsin.C[j]++; break | |
; case 'G': pam->buck[buckid].qc_cpsin.G[j]++; break | |
; case 'T': pam->buck[buckid].qc_cpsin.T[j]++; break | |
; case 'N': pam->buck[buckid].qc_cpsin.N[j]++; break | |
; } | |
if (rec->q_n) | |
{ int kk = MAXQUALITY * j + ((unsigned char) rec->q[j]) | |
;if(0)fprintf(stderr, "testing2 recid=%d buckid=%d qid=%d nt=%d val=%d rootp=%p kk=%d seq=%s\n" | |
, (int) rec->rs.N | |
, buckid | |
, (int) (7 & (unsigned char) rec->q[j]) | |
, (int) j | |
, 0 | |
, (void*) (pam->buck[buckid].qc_bcq) | |
, kk | |
, rec->seq | |
) | |
; pam->buck[buckid].qc_bcq[kk] += 1 /* one block is one read-offset */ | |
/* [ nt0 size MAXQUALITY ] [ nt1 size MAXQUALITY] */ | |
; } | |
} | |
if (N > pam->buck[buckid].global_in_max_size) | |
pam->buck[buckid].global_in_max_size = N | |
; } | |
void qc_tally_out | |
( struct record* rec | |
, struct param* pam | |
, int buckid | |
, int N | |
) | |
{ int j | |
; pam->buck[buckid].qc_lenout[N]++ | |
; for (j=0;j<N;j++) | |
{ switch(rec->seq[j]) | |
{ case 'A': pam->buck[buckid].qc_cpsout.A[j]++; break | |
; case 'C': pam->buck[buckid].qc_cpsout.C[j]++; break | |
; case 'G': pam->buck[buckid].qc_cpsout.G[j]++; break | |
; case 'T': pam->buck[buckid].qc_cpsout.T[j]++; break | |
; case 'N': pam->buck[buckid].qc_cpsout.N[j]++; break | |
; } | |
} | |
if (N > pam->buck[buckid].global_out_max_size) | |
pam->buck[buckid].global_out_max_size = N | |
; } | |
void qc_brief | |
( struct param *pam | |
, struct record *rec | |
, enum program_mode mode | |
) | |
{ argh | |
( "reaper" | |
, "check %d errors, %d reads truncated, %d clean, %d lint, %d total" | |
, (int) rec->rs.N_err | |
, (int) rec->rs.N_truncated | |
, (int) rec->rs.N_clean | |
, (int) rec->rs.N_lint | |
, (int) rec->rs.N | |
) | |
; if (rec->rs.N_alien) | |
argh | |
( "reaper" | |
, "found %d characters outside [ACGTUNX], average %.1f per read" | |
, rec->rs.N_alien | |
, rec->rs.N_alien * 1.0 / rec->rs.N | |
) | |
; if (pam->buck_n > 1) | |
{ int i, j | |
; for (i=0;i<pam->buck_n;i++) | |
{ dim sum = 0 | |
; const char* bc = pam->buck[i].barcode5p | |
; if (!bc) | |
bc = pam->buck[i].barcode3p | |
; for (j=0; j<MAXFIELDSIZE; j++) | |
sum += pam->buck[i].qc_lenout[j] | |
; argh("reaper", "barcode [%s] has %lu clean reads", bc, (long unsigned) sum) | |
; } | |
} | |
} | |
/* this is the quality control report function. mat-happy! */ | |
void qc_report | |
( struct param* pam | |
, struct record* rec | |
, enum program_mode mode | |
) | |
{ int i, j, buckid | |
; for (buckid=0;buckid<=pam->buck_n;buckid++) | |
{ struct bucket* buck = pam->buck+buckid | |
; const char* stage2 = "clean" | |
; const char* thetag = buck->filetag | |
; if (!thetag) | |
thetag = buck->barcode3p | |
; if (!thetag) /* hack, but OK */ | |
thetag = buck->barcode5p | |
; if (!thetag) /* hack hack hack */ | |
thetag = "lane" | |
/* todo: qc_cpsin and qc_cpsout need to keep track of maximum read lengths */ | |
/* qc hack again */ | |
; if (buckid == pam->buck_n && (mode == CASE_3P_BC || mode == CASE_5P_BC)) | |
thetag = "total" | |
, stage2 = "miss" | |
; char* fn_pre_nt = stringle("%s.%s.report.input.nt", pam->basename, thetag) | |
; char* fn_pre_q = stringle("%s.%s.report.input.q", pam->basename, thetag) | |
; char* fn_post_nt = stringle("%s.%s.report.%s.nt", pam->basename, thetag, stage2) | |
; char* fn_post_q = stringle("%s.%s.report.%s.q", pam->basename, thetag, stage2) | |
; char* fn_post_len= stringle("%s.%s.report.%s.len", pam->basename, thetag, stage2) | |
; FILE* fp_pre_nt, *fp_pre_q | |
,* fp_post_nt, *fp_post_len | |
; dim *cpsout[5] | |
= { buck->qc_cpsout.A | |
, buck->qc_cpsout.C | |
, buck->qc_cpsout.G | |
, buck->qc_cpsout.T | |
, buck->qc_cpsout.N | |
} | |
, *cpsin[5] | |
= { buck->qc_cpsin.A | |
, buck->qc_cpsin.C | |
, buck->qc_cpsin.G | |
, buck->qc_cpsin.T | |
, buck->qc_cpsin.N | |
} | |
; if ((fp_pre_nt = myfopen(fn_pre_nt, "w", 0))) | |
{ fprintf(fp_pre_nt, "pos\tA\tC\tG\tT\tN\n") | |
; for (j=0;j<buck->global_in_max_size;j++) | |
fprintf | |
( fp_pre_nt | |
, "%d\t%d\t%d\t%d\t%d\t%d\n" | |
, (int) (j+1) | |
, (int) cpsin[0][j] | |
, (int) cpsin[1][j] | |
, (int) cpsin[2][j] | |
, (int) cpsin[3][j] | |
, (int) cpsin[4][j] | |
) | |
; myfzclose(fp_pre_nt, 0) | |
; } | |
if ((fp_post_nt = myfopen(fn_post_nt, "w", 0))) | |
{ fprintf(fp_post_nt, "pos\tA\tC\tG\tT\tN\n") | |
; for (j=0;j<buck->global_out_max_size;j++) | |
fprintf | |
( fp_post_nt | |
, "%d\t%d\t%d\t%d\t%d\t%d\n" | |
, (int) (j+1) | |
, (int) cpsout[0][j] | |
, (int) cpsout[1][j] | |
, (int) cpsout[2][j] | |
, (int) cpsout[3][j] | |
, (int) cpsout[4][j] | |
) | |
; myfzclose(fp_post_nt, 0) | |
; } | |
if ((fp_post_len = myfopen(fn_post_len, "w", 0))) | |
{ fprintf(fp_post_len, "length\tcount\n") | |
; for (i=0;i<=buck->global_out_max_size;i++) | |
fprintf | |
( fp_post_len | |
, "%d\t%d\n" | |
, (int) i | |
, (int) pam->buck[buckid].qc_lenout[i] | |
) | |
; myfzclose(fp_post_len, 0) | |
; } | |
if ((fp_pre_q = myfopen(fn_pre_q, "w", 0))) | |
{ fprintf(fp_pre_q, "pos\tq0\tq10\tq25\tq50\tq75\tq90\tq100\n") | |
; for (i=0;i<pam->buck[buckid].global_in_max_size;i++) | |
{ dim total = 0, total2 = 0, j /* fixme overflow */ | |
; dim q0=0, q10=0, q25=0, q50=0, q75=0, q90=0, q100=0 | |
; for (j=1;j<MAXQUALITY;j++) /* offset 1 because of q0 check */ | |
total += buck->qc_bcq[i * MAXQUALITY + j] | |
; for (j=1;j<MAXQUALITY;j++) | |
{ int kk = i * MAXQUALITY + j | |
; total2 += buck->qc_bcq[kk] | |
; if (!q0 && total * 0.0 < total2) | |
q0 = j | |
; if (!q10 && total * 0.10 <= total2) | |
q10 = j | |
; if (!q25 && total * 0.25 <= total2) | |
q25 = j | |
; if (!q50 && total * 0.50 <= total2) | |
q50 = j | |
; if (!q75 && total * 0.75 <= total2) | |
q75 = j | |
; if (!q90 && total * 0.90 <= total2) | |
q90 = j | |
; if (!q100 && total * 0.9999 <= total2) | |
q100 = j | |
; } | |
fprintf | |
( fp_pre_q, "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n" | |
, (int) (i+1) | |
, (int) q0 | |
, (int) q10 | |
, (int) q25 | |
, (int) q50 | |
, (int) q75 | |
, (int) q90 | |
, (int) q100 | |
) | |
; } | |
myfzclose(fp_pre_q, 0) | |
; } | |
free(fn_pre_nt) | |
; free(fn_pre_q) | |
; free(fn_post_nt) | |
; free(fn_post_q) | |
; free(fn_post_len) | |
; } | |
{ char* fn_post_tally= stringle("%s.sumstat", pam->basename) | |
; FILE* fp_post_tally | |
; if ((fp_post_tally = myfopen(fn_post_tally, "w", 0))) | |
{ fprintf(fp_post_tally, "discarded_no_barcode_match=%u\n", rec->rs.N_nomatch) | |
; fprintf(fp_post_tally, "discarded_tabu_contamination=%u\n", rec->rs.N_tabu) | |
; fprintf(fp_post_tally, "discarded_no_5p_sinsert=%u\n", rec->rs.N_5p_nosinsert) | |
; fprintf(fp_post_tally, "discarded_no_anchor=%u\n", rec->rs.N_anchor) | |
; fprintf(fp_post_tally, "discarded_NNN=%u\n", rec->rs.N_nn) | |
; fprintf(fp_post_tally, "discarded_BBB=%u\n", rec->rs.N_bb) | |
; fprintf(fp_post_tally, "discarded_AAA=%u\n", rec->rs.N_aa) | |
; fprintf(fp_post_tally, "discarded_QQQ=%u\n", rec->rs.N_qq) | |
; fprintf(fp_post_tally, "discarded_tri=%u\n", rec->rs.N_tri) | |
; fprintf(fp_post_tally, "discarded_length_cutoff=%u\n", rec->rs.N_lenlo) | |
; fprintf(fp_post_tally, "discarded_barcode_conflict=%u\n", rec->rs.N_cfl) | |
; fprintf(fp_post_tally, "discarded_low_complexity=%u\n", rec->rs.N_loco) | |
; fprintf(fp_post_tally, "no_message=%u\n", rec->rs.N_nomsg) | |
; fprintf(fp_post_tally, "total_discarded=%u\n", rec->rs.N_lint) | |
; fprintf(fp_post_tally, "total_error=%u\n", rec->rs.N_err) | |
; fprintf(fp_post_tally, "total_accepted=%u\n", rec->rs.N_clean) | |
; fprintf(fp_post_tally, "total_input=%u\n", rec->rs.N) | |
; fprintf(fp_post_tally, "conflicts_resolved=%u\n", rec->rs.N_cflr) | |
; fprintf(fp_post_tally, "bases_removed_by_quality=%u\n", rec->rs.B_QQ) | |
; fprintf(fp_post_tally, "bases_removed_by_qualityb=%u\n", rec->rs.B_BB) | |
; fprintf(fp_post_tally, "bases_removed_by_tritail=%u\n", rec->rs.B_tri) | |
; fprintf(fp_post_tally, "bases_removed_by_nnn=%u\n", rec->rs.B_NN) | |
; fprintf(fp_post_tally, "bases_removed_by_polya=%u\n", rec->rs.B_AA) | |
; myfzclose(fp_post_tally, 0) | |
; free(fn_post_tally) | |
; } | |
} | |
} | |
int buckets_prepare_qc | |
( struct bucket* buck | |
, int buck_n | |
, enum program_mode mode | |
) | |
{ int i | |
; for (i=0;i<=buck_n;i++) | |
{ buck[i].cache_lint_offset = 0 | |
; buck[i].cache_score = 0 | |
; buck[i].cache_accept = 0 | |
; memset(buck[i].qc_cpsin.A, 0, sizeof buck[i].qc_cpsin.A) | |
; memset(buck[i].qc_cpsin.C, 0, sizeof buck[i].qc_cpsin.C) | |
; memset(buck[i].qc_cpsin.G, 0, sizeof buck[i].qc_cpsin.G) | |
; memset(buck[i].qc_cpsin.T, 0, sizeof buck[i].qc_cpsin.T) | |
; memset(buck[i].qc_cpsin.N, 0, sizeof buck[i].qc_cpsin.N) | |
; memset(buck[i].qc_cpsout.A, 0, sizeof buck[i].qc_cpsout.A) | |
; memset(buck[i].qc_cpsout.C, 0, sizeof buck[i].qc_cpsout.C) | |
; memset(buck[i].qc_cpsout.G, 0, sizeof buck[i].qc_cpsout.G) | |
; memset(buck[i].qc_cpsout.T, 0, sizeof buck[i].qc_cpsout.T) | |
; memset(buck[i].qc_cpsout.N, 0, sizeof buck[i].qc_cpsout.N) | |
; memset(buck[i].qc_lenout, 0, sizeof buck[i].qc_lenout) | |
/* fixme memrelease */ | |
; // if (!(buck[i].qc_bcq = calloc(1 + (MAXQUALITY * MAXFIELDSIZE), sizeof(dim)))) | |
; if (!(buck[i].qc_bcq = calloc((MAXQUALITY * MAXFIELDSIZE) , sizeof(dim)))) | |
X_FINISHED("cannot allocate memory for quality tracking") | |
; buck[i].global_in_max_size = 0 | |
; buck[i].global_out_max_size = 0 | |
; buck[i].barcode_start = 0 | |
; buck[i].barcode_end = 0 | |
; } | |
/* fixme very ugly; init code only for last "special" bucket */ | |
/* rip and refactor at some point */ | |
buck[buck_n].barcode3p = NULL | |
; buck[buck_n].adaptor3p = "the-3-prime-adaptor" | |
; buck[buck_n].sinsert3p = NULL | |
; buck[buck_n].barcode5p = NULL | |
; buck[buck_n].sinsert5p = NULL | |
; buck[buck_n].cat3p = NULL | |
; buck[buck_n].filetag = NULL | |
; return 0 | |
; } | |
void data_release | |
( struct param* pam | |
) | |
{ int i | |
; if (pam->buck) | |
{ for (i=0;i<=pam->buck_n;i++) | |
{ struct bucket* buck = pam->buck+i | |
; struct atabu* boo = &buck->tabu2, *boop = NULL /* boop'revious */ | |
; int n = 0 | |
; while (boo) /* hierverder: corrupt if meta file cannot be opened! */ | |
{ if (boo->tabuseq) | |
free(boo->tabuseq) | |
; boop = boo->next | |
; if (n++) | |
free(boo) | |
; boo = boop | |
; } | |
if (buck->qc_bcq) free(buck->qc_bcq) | |
; if (buck->fname) free(buck->fname) | |
; } | |
free(pam->buck) | |
; } | |
pam->buck = NULL | |
; pam->buck_n = -1 | |
; table_release(&pam->metadata) | |
; } | |
int barcodes_io_open | |
( struct bucket* buck | |
, const char* basename | |
, int buck_n | |
, int zippit | |
) | |
{ int i | |
; for (i=0;i<buck_n;i++) | |
{ const char* thetag = buck[i].filetag | |
; if (!thetag) | |
thetag = buck[i].barcode3p | |
; if (!thetag) | |
thetag = buck[i].barcode5p | |
; buck[i].fname = stringle("%s.%s.%s", basename, thetag, zippit ? "clean.gz" : "clean") | |
; if (!(buck[i].fp = myfopen(buck[i].fname, ZWMODE, zippit))) | |
{ argh("reaper", "cannot open file %s", buck[i].fname) | |
; return 1 | |
; } | |
} | |
return 0 | |
; } | |
int barcodes_io_close | |
( struct bucket* buck | |
, int buck_n | |
, int zippit | |
) | |
{ int i | |
; for (i=0;i<buck_n;i++) | |
myfzclose(buck[i].fp, zippit) | |
; return 0 | |
; } | |
/* fixme, could have seqp and qp pointers */ | |
void rec_remove_prefix | |
( struct record* rec | |
, int rm | |
) | |
{ if (rm > rec->seq_n) | |
rm = rec->seq_n | |
; memmove(rec->seq, rec->seq+rm, rec->seq_n - rm) | |
; rec->seq_n -= rm | |
; rec->seq[rec->seq_n] = '\0' | |
; if (rec->q_n) | |
memmove(rec->q, rec->q+rm, rec->q_n - rm) | |
, rec->q_n -= rm | |
, rec->q[rec->q_n] = '\0' | |
; } | |
int column_require | |
( int i | |
, const char* str | |
) | |
{ if (i < 0) | |
X_FINISHED("need column %s", str) | |
; return 0 | |
; } | |
/* fixme: seem to have multiple places with barcode instantiation. | |
* deduplicate. | |
*/ | |
int adaptors_set_case_3p | |
( struct bucket* buck | |
, int* buck_n | |
, struct param* pam | |
) | |
{ buck[0].barcode5p = NULL | |
; buck[0].barcode3p = NULL | |
; buck[0].sinsert3p = NULL | |
; buck[0].cat3p = NULL | |
; buck[0].sinsert5p = pam->sinsert_5p | |
; buck[0].adaptor3p = pam->adaptor_3p | |
; buck[0].tabu2 = pam->the_tabu2 | |
; if (g_mode == CASE_NOBC && !pam->adaptor_3p) | |
X_FINISHED("-geom 3p need a 3p adaptor") | |
; if | |
( g_restrict > 0 | |
&& strlen(pam->adaptor_3p) > g_restrict | |
/* fixme: old code had | |
&& strlen(pam->adaptor_3p) < g_restrict | |
and led to different results sometimes. | |
*/ | |
) | |
((char*) buck[0].adaptor3p)[g_restrict] = '\0' | |
;fprintf(stderr, "adapter %s\n", buck[0].adaptor3p) | |
; buck_n[0] = 1 | |
; return 0 | |
; } | |
/* ugly, designtabu */ | |
void parse_tabu | |
( struct atabu* boo | |
, const char* s | |
, int extent | |
) | |
{ struct match_requirement mr_tabu = { 16, 2, 1, 0 } | |
; const char* tok1 = NULL | |
; do | |
{ char* s2 = NULL | |
; tok1 = strchr(s, ',') | |
; s2 = stringle("%.*s", (int) (tok1 ? (tok1 - s) : strlen(s)), s) | |
; boo->tabuseq = s2 | |
; boo->tabulen = strlen(s2) | |
; boo->mr_tabu = mr_tabu | |
; boo->next = NULL | |
; if (extent && boo->tabulen > extent) | |
boo->tabulen = extent | |
, boo->tabuseq[extent] = '\0' | |
; if (tok1) | |
{ boo->next = myalloc(sizeof boo[0]) | |
; boo = boo->next | |
; s = tok1 + 1 | |
; } | |
} | |
while(tok1) | |
; return | |
; } | |
int buckets_alloc | |
( struct param* pam | |
, int n | |
) | |
{ if (n > 64) | |
argh("reaper", "that is a lot of barcodes, that is. %d of'em", n) | |
; if (!(pam->buck = calloc((n+1), sizeof pam->buck[0]))) | |
X_FINISHED("unable to allocate data for %d barcodes", n) | |
; pam->buck_n = n | |
; return 0 | |
; } | |
int config_read | |
( const char* fn | |
, struct param* pam | |
, enum program_mode mode | |
, int extent | |
) | |
{ int id_barcode , doc_barcode = 0 | |
; int id_3p_ad , doc_3p_ad = 1 | |
; int id_3p_si , doc_3p_si = 2 | |
; int id_5p_si , doc_5p_si = 3 | |
; int id_tabu , doc_tabu = 4 | |
; int id_filetag , doc_filetag = 5 | |
; int n_columns_found | |
; int n_barcodes = 0 | |
; int barcodes_length = 0 | |
; struct table* thetable = &pam->metadata | |
; ZFILE ip | |
; struct bucket* buck = NULL | |
; if (!(ip = myfopen(fn, ZRMODE, 1))) | |
return 1 | |
; const char* dochandle[6] = { "barcode", "3p-ad", "3p-si", "5p-si", "tabu", "filetag" } | |
; X_CASCADE_CLEAN(table_read(ip, thetable, 's', TABLE_WITH_COLNAMES), myfzclose(ip, 1)) | |
; X_CASCADE(buckets_alloc(pam, thetable->n_rows)) | |
; buck = pam->buck | |
; id_barcode = table_column_index(thetable, dochandle[doc_barcode]) | |
; id_3p_ad = table_column_index(thetable, dochandle[doc_3p_ad]) | |
; id_3p_si = table_column_index(thetable, dochandle[doc_3p_si]) | |
; id_5p_si = table_column_index(thetable, dochandle[doc_5p_si]) | |
; id_tabu = table_column_index(thetable, dochandle[doc_tabu]) | |
; id_filetag = table_column_index(thetable, dochandle[doc_filetag]) | |
; n_columns_found | |
= ( id_3p_ad >= 0 ) | |
+ ( id_barcode >= 0 ) | |
+ ( id_3p_si >= 0 ) | |
+ ( id_5p_si >= 0 ) | |
+ ( id_tabu >= 0 ) | |
+ ( id_filetag >= 0 ) | |
; switch(mode) | |
{ case CASE_NOBC | |
: { X_CASCADE(column_require(id_3p_ad, dochandle[doc_3p_ad])) | |
; X_CASCADE(column_require(id_tabu, dochandle[doc_tabu])) | |
; if (n_columns_found > 2) | |
argh("reaper", "only using columns %s and %s", dochandle[doc_3p_ad], dochandle[doc_tabu]) | |
; break | |
; } | |
case CASE_3P_BC | |
: { X_CASCADE(column_require(id_3p_si, dochandle[doc_3p_si])) | |
; X_CASCADE(column_require(id_barcode, dochandle[doc_barcode])) | |
; X_CASCADE(column_require(id_tabu, dochandle[doc_tabu])) | |
; X_CASCADE(column_require(id_3p_ad, dochandle[doc_3p_ad])) | |
; if (n_columns_found > 4) | |
argh("reaper", "only using columns %s, %s and %s", dochandle[doc_barcode], dochandle[doc_3p_si], dochandle[doc_tabu]) | |
; break | |
; } | |
case CASE_5P_BC | |
: { X_CASCADE(column_require(id_barcode, dochandle[doc_barcode])) | |
; X_CASCADE(column_require(id_5p_si, dochandle[doc_5p_si])) | |
; X_CASCADE(column_require(id_3p_ad, dochandle[doc_3p_ad])) | |
; X_CASCADE(column_require(id_tabu, dochandle[doc_tabu])) | |
; if (n_columns_found > 4) | |
argh | |
( "reaper" | |
, "only using columns %s, %s, %s and %s" | |
, dochandle[doc_barcode] | |
, dochandle[doc_5p_si] | |
, dochandle[id_3p_ad], dochandle[doc_tabu] | |
) | |
; break | |
; } | |
default | |
: ; | |
} | |
; for (n_barcodes=0; n_barcodes < thetable->n_rows; n_barcodes++) | |
{ int n = n_barcodes | |
; buck[n].barcode5p = NULL | |
; buck[n].sinsert5p = NULL | |
; buck[n].sinsert5p_n = 0 | |
; buck[n].barcode3p = NULL | |
; buck[n].sinsert3p = NULL | |
; buck[n].adaptor3p = NULL | |
; buck[n].sinsert3p_n = 0 | |
; buck[n].cat3p = NULL | |
; buck[n].filetag = NULL | |
; buck[n].barcode_n = 0 | |
#define hte(s) (strcmp(s, "-") ? s : "") /* hyphen-to-empty */ | |
; if (id_filetag >= 0) | |
buck[n].filetag = hte(table_elem_string(thetable, n, id_filetag)) | |
; if (mode == CASE_NOBC) | |
{ buck[n].adaptor3p = hte(table_elem_string(thetable, n, id_3p_ad)) | |
; parse_tabu(&buck[n].tabu2, hte(table_elem_string(thetable, n, id_tabu)), extent) | |
; if (id_5p_si >= 0) | |
buck[n].sinsert5p = hte(table_elem_string(thetable, n, id_5p_si)) | |
, buck[n].sinsert5p_n = strlen(buck[n].sinsert5p) /* fixme do length assignments in unified place */ | |
; } | |
else if (mode == CASE_3P_BC) | |
{ buck[n].barcode3p = table_elem_string(thetable, n, id_barcode) | |
; buck[n].adaptor3p = hte(table_elem_string(thetable, n, id_3p_ad)) | |
; buck[n].sinsert3p = hte(table_elem_string(thetable, n, id_3p_si)) | |
; parse_tabu(&buck[n].tabu2, hte(table_elem_string(thetable, n, id_tabu)), extent) | |
; buck[n].sinsert3p_n = strlen(buck[n].sinsert3p) | |
; buck[n].barcode_n = strlen(buck[n].barcode3p) | |
; buck[n].cat3p = stringle("%s%s%s", buck[n].sinsert3p, buck[n].barcode3p, buck[n].adaptor3p) | |
; if (buck[n].sinsert3p_n > 1) | |
X_FINISHED("3p sequence insert of size > 1 not yet supported; ask Stijn") | |
; if (n == 0) | |
barcodes_length = strlen(buck[n].barcode3p) | |
; else if (barcodes_length != strlen(buck[n].barcode3p)) | |
argh("reaper", "barcodes with different lengths") | |
/* X_FINISHED("barcodes with different lengths") */ | |
; } | |
else if (mode == CASE_5P_BC) | |
{ buck[n].barcode5p = table_elem_string(thetable, n, id_barcode) | |
; buck[n].sinsert5p = hte(table_elem_string(thetable, n, id_5p_si)) | |
; buck[n].adaptor3p = hte(table_elem_string(thetable, n, id_3p_ad)) | |
; parse_tabu(&buck[n].tabu2, hte(table_elem_string(thetable, n, id_tabu)), extent) | |
; buck[n].sinsert5p_n = strlen(buck[n].sinsert5p) | |
; buck[n].barcode_n = strlen(buck[n].barcode5p) | |
; if (n == 0) | |
barcodes_length = strlen(buck[n].barcode5p) | |
; else if (barcodes_length != strlen(buck[n].barcode5p)) | |
argh("reaper", "barcodes with different lengths") | |
/* X_FINISHED("barcodes with different lengths") */ | |
; } | |
if (extent > 0) | |
{ if (buck[n].adaptor3p && strlen(buck[n].adaptor3p) > extent) | |
((char*) buck[n].adaptor3p)[extent] = '\0' | |
; } | |
argh | |
( "reaper" | |
, "scanned line %d sinsert3p [%s] barcode3p [%s] adaptor3p [%s] cat3p [%s]" | |
" barcode5p [%s] sinsert5p [%s] tabu [%s]" | |
, (int) (n+1) | |
, buck[n].sinsert3p ? buck[n].sinsert3p : "NA" | |
, buck[n].barcode3p ? buck[n].barcode3p : "NA" | |
, buck[n].adaptor3p ? buck[n].adaptor3p : "NA" | |
, buck[n].cat3p ? buck[n].cat3p : "NA" | |
, buck[n].barcode5p ? buck[n].barcode5p : "NA" | |
, buck[n].sinsert5p ? buck[n].sinsert5p : "NA" | |
, buck[n].tabu2.tabuseq ? buck[n].tabu2.tabuseq : "" | |
) | |
; { struct atabu* boo = buck[n].tabu2.next | |
; while (boo) | |
{ fprintf(stderr, " [%s]", boo->tabuseq) | |
; boo = boo->next | |
; } | |
} | |
} | |
; return 0 | |
; } | |
/* is the below comment up to date? */ | |
/* The second condition is because we only fill part of the alignment matrix. | |
* in the equality case the matrix will be emtpy, and sw_trace3 will crash. | |
* The easiest way is to check here (BUT, fixme nevertheless; sw_trace3 should | |
* do better; it's probably crashing because max_ij is never increased and starts at 0). | |
* Note in barcode mode we need at least the barcode (plus a little bit more, | |
* likely), so an initial read of length cleanlength (before aligning) | |
* is never going to make it. | |
*/ | |
int check_for_tabu | |
( struct record* rec | |
, unsigned len | |
, struct param* pam | |
, struct atabu* boo | |
) | |
{ struct sw_alninfo ai = { 0 } | |
; char* s = rec->seq | |
; char* q = rec->q | |
; int indel_allowed = pam->mr_tabu.mr_maxgap > 0 | |
; char c_s = s[len] /* hack/kludge; note that len is user specified */ | |
; char c_q = q[len] /* hack/kludge NOTE harmless in case q_n == 0 */ | |
; int found_alignment = 0 | |
; SWNUM data[MATRIX_SIZE] = { 0 } | |
; if | |
( !boo | |
|| !boo->tabulen | |
|| !pam->mr_tabu.mr_minlen | |
) | |
return 0 | |
; s[len] = '\0' | |
; q[len] = '\0' | |
; do | |
{ if (sw_fill2(&ai, data, MATRIX_SIZE, s, boo->tabuseq, indel_allowed, &pam->swp_tabu)) | |
X_IGNORE("sw error in check_for_tabu") | |
; rec->rs.N_cells += ai.n_cells_used | |
; sw_trace3(&ai, &pam->swp_tabu, ai.max_ij, SW_TRIM) /* check_for_tabu */ | |
; if (sw_match_any(&ai, &pam->mr_tabu)) | |
{ found_alignment = 1 | |
; break | |
; } | |
boo = boo->next | |
; } | |
while (boo && boo->tabulen) | |
/* in check_for_tabu, fplint; this one printed before read */ | |
; if | |
( pam->verbose & VB_ALIGN | |
&& rec->rs.N % pam->sample_lint == 0 | |
) | |
sw_pp2("read", x_tabu[found_alignment], -1, &ai, pam->fplint, pam->zippit, rec->rs.N, NULL) | |
/* in check_for_tabu */ | |
; s[len] = c_s /* hack/kludge */ | |
; q[len] = c_q /* hack/kludge */ | |
; return found_alignment | |
; } | |
/* note: mr_minlen not used! */ | |
/* note: sinsert is left in the alignment */ | |
int sw_match_5p_required | |
( struct sw_alninfo* ai | |
, int adaptor_length | |
, struct match_requirement* mr | |
) | |
{ int n_edit = ai->n_insl + ai->n_insr + ai->n_subst | |
; int n_border_left = THEMAX(ai->lft_start, ai->rgt_start) - 1 | |
; int n_border_right = adaptor_length - ai->lft_end | |
; int n_border = n_border_left + n_border_right | |
;if(0)fprintf(stderr, "border-left: %d border-right: %d\n", n_border_left, n_border_right) | |
; return | |
( ai->n_match + n_edit + n_border >= adaptor_length /* cover entire adapter */ | |
&& n_edit + n_border <= mr->mr_maxedit /* edit requriement */ | |
&& ai->n_insl + ai->n_insr <= mr->mr_maxgap /* gap requirement */ | |
) | |
? 1 | |
: 0 | |
; } | |
/* only called by ntnt */ | |
int check_5p_sinsert | |
( struct record* rec | |
, struct param* pam | |
, ZFILE fpclean /* clean file is barcode specific */ | |
, const char** msgp | |
) | |
{ int keepall = pam->modes & MODE_KEEPALL | |
; if | |
( !pam->sinsert_5p | |
|| !strlen(pam->sinsert_5p) | |
) | |
return 0 | |
;if(0)fprintf(stderr, "insert %s\n", pam->sinsert_5p) | |
; pam->swp_5p.right_limit = strlen(pam->sinsert_5p) + pam->mr_5p_require.mr_offset | |
; { struct sw_alninfo ai = { 0 } | |
; SWNUM data[MATRIX_SIZE] = { 0 } | |
; int adaptor_length = strlen(pam->sinsert_5p) | |
; int indel_allowed = pam->mr_5p_require.mr_maxgap > 0 | |
; int found_alignment = 0 | |
; if (sw_fill2(&ai, data, MATRIX_SIZE, pam->sinsert_5p, rec->seq, indel_allowed, &pam->swp_5p)) | |
X_FINISHED_VAL(-1, "sw error in check_5p_sinsert") | |
; rec->rs.N_cells += ai.ni * ai.nj | |
; sw_trace3 | |
( &ai | |
, &pam->swp_5p | |
, ai.max_ij | |
, indel_allowed ? SW_TRIM : 0 | |
) /* check_5p_sinsert */ | |
; found_alignment = sw_match_5p_required(&ai, adaptor_length, &pam->mr_5p_require) | |
;if(0) | |
fprintf(stderr, "%s %s %d\n", pam->sinsert_5p, rec->seq, found_alignment) | |
,sw_pp2("read", x_5psi[found_alignment], -1, &ai, stdout, 0, rec->rs.N, NULL) | |
/* in check_5p_sinsert, fpclean, fplint; caller ntnt returns immediately */ | |
; if | |
( pam->verbose & VB_ALIGN | |
&& rec->rs.N % pam->sample_lint == 0 | |
) | |
sw_pp2("read", x_5psi[found_alignment], -1, &ai, pam->fplint, pam->zippit, rec->rs.N, NULL) | |
/* in check_5p_sinsert */ | |
; if (found_alignment) | |
return ai.rgt_end + (ai.ni-1-ai.lft_end) /* ai.ni-1 is the length of left sequence */ | |
; else | |
{ if (keepall) | |
{ msgp[0] = "5p-sinsert" | |
; return rec->seq_n | |
; } | |
rec->rs.N_5p_nosinsert++ | |
; rec->rs.N_lint++ | |
; rec->out_message = "5p-absent" | |
; dump_read(0, pam->fplint, rec, NULL, NULL, pam) | |
; return -1 | |
; } | |
} | |
return 0 | |
; } | |
/* fixme; what if the input read has length smaller than cleanlength and no | |
* adaptor or tabu is found? detected? | |
*/ | |
int ntnt /* nucleotide nip and tuck */ | |
( struct record* rec | |
, struct param* pam /* lint file is taken from pam, non-specific */ | |
, ZFILE fpclean /* clean file is barcode specific */ | |
) | |
{ int clean = 0, lint_offset = 1, p_5p_offset = 0 | |
; int indel_allowed = pam->mr_3p_global.mr_maxgap > 0 | |
; int keepall = pam->modes & MODE_KEEPALL | |
; const char* msg = "clean" | |
; int found_alignment = 0 | |
; struct sw_alninfo ai = { 0 } | |
; struct match_account ma = { 'n', 0, 0, -1, -1 } | |
; struct sw_param myswp = pam->swp_3p | |
; SWNUM data[MATRIX_SIZE] = { 0 } | |
/* returns the offset of the first good bit */ | |
/* updates rec->rs */ | |
; p_5p_offset = check_5p_sinsert(rec, pam, fpclean, &msg) | |
; if (p_5p_offset < 0) | |
return 1 | |
; else if (p_5p_offset > 0) | |
rec_remove_prefix(rec, p_5p_offset) | |
; if (g_guard > 0) | |
myswp.left_skip = g_guard | |
/* hierverder: if empty read, everything below is still acted | |
* upon; do we hit any of those conditions? | |
*/ | |
/* CASE_NOBC may or may not set pam->adaptor_3p and pam->tabu2 | |
* fixme check. | |
*/ | |
; if (pam->adaptor_3p) | |
{ if | |
( sw_fill2 | |
( &ai | |
, data | |
, MATRIX_SIZE | |
, rec->seq | |
, pam->adaptor_3p | |
, indel_allowed | |
, &myswp | |
) ) | |
X_FINISHED("sw error in nip_and_tuck") | |
; rec->rs.N_cells += ai.ni * ai.nj | |
; sw_trace3(&ai, &myswp, ai.max_ij, SW_TRIMLEFT) /* decideme: why not SW_TRIMRIGHT? */ | |
; } | |
ma.nnn_offset = get_nnn_offset(ai.left, ai.ni-1, pam->nnn_count, pam->nnn_outof) | |
; ma.qqq_offset = get_qqq_offset(rec, pam->q_cutoff, pam->q_winlen, pam->q_winofs, pam->q_readofs) | |
; ma.bbb_offset = pam->modes & MODE_QC_LATE ? get_bbb_offset(rec) : -1 | |
; ma.aaa_offset = get_polya_offset(ai.left, ai.ni-1, pam->polya) | |
; ma.dust_offset= get_dust_offset(ai.left, ai.ni-1, pam->taildust, pam->taildust_bases) | |
/* NOTE everything in alignment space is offset one relative to string | |
* space (C's normal 0-based indexing). In alignment space the | |
* alignment starts at ai.lft_start, so the good bit ends at | |
* ai.lft_start-1. In string space that's at ai.lft_start-2. The | |
* length of the good bit is thus ai.lft_start - 1: that's what we | |
* need to compare with pam->cleanlength. | |
* NNN space is just string-based, so lint_offset is not adjusted when | |
* set to ma.nnn_offset. The same holds for BBB space. | |
* Finally, if 0 == pam->cleanlength, then we should always accept the | |
* read, even though the length cutoff comparison might *still* fail | |
* due to the ai.rgt_start offsetting -- we may have a clean | |
* readlength of -2 if for instance 3 == rgt_start. | |
*/ | |
; if (pam->adaptor_3p) | |
found_alignment = sw_match_any(&ai, &pam->mr_3p_global) | |
; if | |
( pam->adaptor_3p[0] /* fixme/docme */ | |
&& pam->verbose & VB_ALIGN | |
&& rec->rs.N % pam->sample_lint == 0 | |
) | |
sw_pp2("read", x_3pgb[found_alignment], -1, &ai, pam->fplint, pam->zippit, rec->rs.N, NULL) | |
; if (found_alignment) | |
{ ma.match_type = 'g' | |
; clean = (ai.lft_start + 1) > pam->cleanlength + ai.rgt_start | |
; msg = "len-global" | |
; if (clean) | |
lint_offset = (ai.lft_start + 1) - ai.rgt_start | |
; else if (!pam->cleanlength && ai.lft_start < ai.rgt_start) | |
clean = 1 | |
, lint_offset = 1 | |
; else | |
rec->rs.N_lenlo++ | |
; | |
} | |
/* below, first use the overal best alignment. | |
* This may find a prefix match followed by low complexity sequence. | |
* Then look at the tail of the read only. | |
*/ | |
else if (pam->adaptor_3p && pam->mr_3p_prefix.mr_minlen) | |
{ unsigned z1 = sw_match_end(&ai, &ma, &myswp, pam->p_3p_tth, &pam->mr_3p_prefix) | |
;if(0)fprintf(stderr, "z1 %d\n", (int) z1) | |
; found_alignment = z1 && ma.suffix_complexity < pam->suffix_complexity_max | |
; if (!found_alignment) | |
{ sw_trace3(&ai, &myswp, ai.max_ej, SW_TRIMLEFT) /* DON'T trim right; it needs to stay anchored */ | |
; found_alignment = sw_match_end(&ai, &ma, &myswp, pam->p_3p_tth, &pam->mr_3p_prefix) | |
; if | |
( pam->adaptor_3p[0] /* fixme/docme */ | |
&& pam->verbose & VB_ALIGN | |
&& rec->rs.N % pam->sample_lint == 0 | |
) | |
sw_pp2("read", x_3ppx[found_alignment], -1, &ai, pam->fplint, pam->zippit, rec->rs.N, NULL) | |
;if(0)fprintf(stderr, "found_alignment %d\n", (int) found_alignment) | |
; } | |
if (found_alignment) | |
{ if (ma.n_perfect) | |
clean = ai.ni > pam->cleanlength + ma.n_perfect | |
, ma.match_type = 't' | |
; else | |
clean = (ai.lft_start + 1) > pam->cleanlength + ai.rgt_start | |
, ma.match_type = 'p' | |
; msg = "len-suffix" | |
; if (clean) | |
{ lint_offset | |
= ma.n_perfect | |
? ai.ni - ma.n_perfect | |
: (ai.lft_start + 1) - ai.rgt_start | |
; } | |
else if (!pam->cleanlength && ai.lft_start < ai.rgt_start) | |
{ clean = 1 | |
; lint_offset = 1 | |
; } | |
else | |
rec->rs.N_lenlo++ | |
; } | |
else if (ai.ni > pam->cleanlength) | |
clean = 1 | |
, lint_offset = ai.ni | |
; } | |
else | |
clean = 1 | |
, lint_offset = rec->seq_n + 1 | |
/* This may happen due to the correction for right start | |
* (the alignment can start at base 2 in the adapter, | |
* base 0 in the read). | |
* Set to 1 by way of defensive programming. | |
*/ | |
; if (!lint_offset) | |
lint_offset = 1 | |
/* fixme: funcify */ | |
; if (clean && ma.nnn_offset > 0) | |
{ rec->rs.B_NN += rec->seq_n+1 - ma.nnn_offset | |
; if (ma.nnn_offset <= pam->cleanlength) | |
{ clean = 0 | |
; rec->rs.N_nn++ | |
; msg = "NNN" | |
; } | |
else if (ma.nnn_offset < lint_offset) | |
lint_offset = ma.nnn_offset | |
; } | |
/* fixme: funcify */ | |
if (clean && ma.qqq_offset > 0) | |
{ rec->rs.B_QQ += rec->seq_n+1 - ma.qqq_offset | |
; if (ma.qqq_offset <= pam->cleanlength) | |
{ clean = 0 | |
; rec->rs.N_qq++ | |
; msg = "lowmq" | |
; } | |
else if (ma.qqq_offset < lint_offset) | |
lint_offset = ma.qqq_offset | |
; } | |
/* ach, require 3 bases, why not */ | |
if (pam->taildustlate && clean && lint_offset > 3) | |
{ int dust_offset = get_dust_offset(ai.left, lint_offset-1, pam->taildustlate, pam->taildustlate_bases) | |
; if (dust_offset >= 0) | |
{ if (ma.dust_offset < 0 || dust_offset < ma.dust_offset) | |
ma.dust_offset = dust_offset | |
; } | |
} | |
/* fixme: funcify */ | |
if (clean && ma.dust_offset > 0) | |
{ rec->rs.B_tri += rec->seq_n+1 - ma.dust_offset | |
; if (ma.dust_offset <= pam->cleanlength) | |
{ clean = 0 | |
; rec->rs.N_tri++ | |
; msg = "tritail" | |
; } | |
else if (ma.dust_offset < lint_offset) | |
lint_offset = ma.dust_offset | |
; } | |
/* fixme: funcify */ | |
if (clean && ma.bbb_offset > 0) | |
{ rec->rs.B_BB += rec->seq_n+1 - ma.bbb_offset | |
; if (ma.bbb_offset <= pam->cleanlength) | |
{ clean = 0 | |
; rec->rs.N_bb++ | |
; msg = "lowq" | |
; } | |
else if (ma.bbb_offset < lint_offset) | |
lint_offset = ma.bbb_offset | |
; } | |
/* fixme: funcify | |
* hierverder: is B_AA incremenent OK? (doublecounting) | |
*/ | |
if (clean && ma.aaa_offset > 0) | |
{ rec->rs.B_AA += rec->seq_n+1 - ma.aaa_offset | |
; if (ma.aaa_offset <= pam->cleanlength) | |
{ clean = 0 | |
; rec->rs.N_aa++ | |
; msg = "polya" | |
; } | |
else if (ma.aaa_offset < lint_offset) | |
lint_offset = ma.aaa_offset | |
;if(0)fprintf(stderr, "polya %d lint %d clean %d seq %s %d\n", (int) ma.aaa_offset, (int) lint_offset, (int) clean, rec->seq, (int) rec->rs.N) | |
; } | |
/* fixme: funcify */ | |
if (clean && pam->length_co > 0) | |
{ rec->rs.B_lco += rec->seq_n - pam->length_co | |
; if (pam->length_co+1 < lint_offset) | |
lint_offset = pam->length_co+1 | |
; } | |
if (clean && pam->trintco && pam->trintco < trintscore(rec->seq, lint_offset-1)) | |
{ clean = 0 | |
; rec->rs.N_tri++ | |
; msg = "trint" | |
; } | |
/* caller sets pam->the_tabu2 */ | |
if | |
( clean | |
&& pam->the_tabu2.tabulen | |
&& check_for_tabu(rec, lint_offset-1, pam, &pam->the_tabu2) | |
) | |
{ if (keepall) | |
lint_offset = 1 | |
; else | |
{ clean = 0 | |
; rec->rs.N_tabu++ | |
; } | |
msg = "tabu" | |
; } | |
clean |= keepall | |
; if (clean) | |
{ rec->rs.N_clean++ | |
; rec->clean_n = lint_offset - 1 | |
; rec->out_message = msg | |
; dump_read(1, fpclean, rec, &ai, &ma, pam) | |
; } | |
else | |
{ rec->rs.N_lint++ | |
; if (!msg) | |
msg = "no message (weird)" | |
, rec->rs.N_nomsg++ | |
; rec->out_message = msg | |
; dump_read(0, pam->fplint, rec, &ai, &ma, pam) | |
; rec->clean_n = -1 | |
; } | |
return 0 | |
; } | |
/* fixfixme: adaptor settings */ | |
int do_adaptor_3p | |
( struct record* rec | |
, struct param* pam | |
) | |
{ pam->adaptor_3p = pam->buck[0].adaptor3p | |
; pam->the_tabu2 = pam->buck[0].tabu2 | |
; pam->sinsert_5p = pam->buck[0].sinsert5p | |
#if 0 | |
= (pam->buck[bestid].sinsert5p_n) | |
? pam->buck[bestid].sinsert5p | |
: NULL | |
#endif | |
; X_CASCADE(ntnt(rec, pam, pam->fpclean)) | |
; if (rec->clean_n >= 0) | |
qc_tally_out(rec, pam, pam->buck_n, rec->clean_n) /* /specialbucket/ */ | |
; return 0 | |
; } | |
int do_barcode_3p | |
( struct record *rec | |
, struct param *pam | |
) | |
{ int NNN_offset = get_nnn_offset(rec->seq, rec->seq_n, pam->nnn_count, pam->nnn_outof) | |
; int BBB_offset = pam->modes & MODE_QC_LATE ? get_bbb_offset(rec) : -1 | |
; int lint_offset = 0 | |
; int indel_allowed = pam->mr_3p_global.mr_maxgap > 0 | |
; SWNUM data[MATRIX_SIZE] = { 0 } | |
; struct sw_alninfo ai = { 0 } | |
; int buckid, id_clean = 0, n_clean_best = 0 | |
; const char* msg = "nomatch" | |
; int score_best = 0, n_pass = 0 | |
; const char* cfl[10] = { "cfl0", "cfl1", "cfl2", "cfl3", "cfl4", "cfl5", "cfl6", "cfl7", "cfl8", "cflZ" } | |
; for (buckid = 0; buckid < pam->buck_n; buckid++) | |
{ int score = 0 | |
/* NOTE: | |
* We used to do restriction of search space | |
* (e.g. rec->seq + pam->cleanlength, limit | |
* search to start of adaptor). That restriction was dropped, | |
* but the reasons for dropping were not noted unfortunately. | |
*/ | |
; if | |
( sw_fill2 | |
( &ai | |
, data | |
, MATRIX_SIZE | |
, rec->seq | |
, pam->buck[buckid].cat3p | |
, indel_allowed | |
, &pam->swp_3p | |
) ) | |
X_FINISHED("sw error in do_barcode_3p") | |
; rec->rs.N_cells += ai.ni * ai.nj | |
; sw_trace3(&ai, &pam->swp_3p, ai.max_ij, 0) /* do_barcode_3p */ | |
; lint_offset = 1 | |
; if ((ai.lft_start + 1) > ai.rgt_start) | |
lint_offset = (ai.lft_start + 1) - ai.rgt_start | |
/* donow: ship barcode offset and length */ | |
; if | |
( sw_match_bc3p | |
( &ai | |
, &pam->mr_3p_global | |
, &pam->mr_3p_prefix | |
, &pam->mr_3p_barcode | |
, pam->buck[buckid].sinsert3p_n | |
, pam->buck[buckid].barcode_n | |
, &score /* ignored for now */ | |
) ) | |
{ score = ai.n_match - ai.n_insl - ai.n_insr - ai.n_subst | |
; n_pass++ | |
; if (score > score_best) | |
{ id_clean = buckid | |
; n_clean_best = 1 | |
; score_best = score | |
; } | |
else if (score == score_best) | |
n_clean_best++ | |
; pam->buck[buckid].cache_score = score | |
; pam->buck[buckid].cache_accept = 1 | |
; } | |
else | |
lint_offset = ai.ni | |
, pam->buck[buckid].cache_score = score | |
, pam->buck[buckid].cache_accept = 0 | |
; pam->buck[buckid].cache_lint_offset = lint_offset | |
; } | |
if (n_clean_best == 0) | |
rec->rs.N_nomatch++ | |
; if (n_clean_best == 1 && NNN_offset > 0) | |
{ rec->rs.B_NN += rec->seq_n+1 - NNN_offset | |
; if (NNN_offset <= pam->cleanlength) | |
{ n_clean_best = 0 | |
; rec->rs.N_nn++ | |
; msg = "NNN" | |
; } | |
else if (NNN_offset < lint_offset) | |
lint_offset = NNN_offset | |
; } | |
if (n_clean_best == 1 && BBB_offset > 0) | |
{ rec->rs.B_BB += rec->seq_n+1 - BBB_offset | |
; if (BBB_offset <= pam->cleanlength) | |
{ n_clean_best = 0 | |
; rec->rs.N_bb++ | |
; msg = "lowq" | |
; } | |
else if (BBB_offset < lint_offset) | |
lint_offset = BBB_offset | |
; } | |
/* The check for pam->cleanlength is so that we do | |
* not drop reads that have a virtual 'negative' | |
* offset of the first base of aligned adaptor. | |
*/ | |
if | |
( n_clean_best == 1 | |
&& pam->cleanlength | |
&& ai.lft_start + 1 <= pam->cleanlength + ai.rgt_start | |
) | |
{ n_clean_best = 0 | |
; msg = "len" | |
; rec->rs.N_lenlo++ | |
; } | |
/* Soft assertion: see comment below */ | |
if (n_clean_best == 1 && !lint_offset) | |
{ argh("reaper", "nasty zero lint offset, please fix code") | |
; lint_offset = 1 | |
; } | |
if | |
( n_clean_best == 1 | |
&& check_for_tabu(rec, lint_offset-1, pam, &pam->buck[id_clean].tabu2) | |
/* warning: in the above check (and further below as well) | |
* we need that lint_offset > 0. | |
* this should be garantueed by having a match and | |
* by restricting the search space, but | |
* with an all-zero alignment matrix it could | |
* still be icky. So there's an assertion above. | |
*/ | |
) | |
{ n_clean_best = 0 | |
; rec->rs.N_tabu++ | |
; msg = "tabu" | |
; } | |
if (n_clean_best == 1) | |
{ rec->clean_n = pam->buck[id_clean].cache_lint_offset - 1 | |
; rec->out_message = NULL | |
; dump_read(1, pam->buck[id_clean].fp, rec, NULL, NULL, pam) | |
; rec->rs.N_clean++ | |
#if 0 | |
; if (n_pass > 1) | |
rec->rs.N_cflr++ | |
#endif | |
; qc_tally_in(rec, pam, id_clean, rec->seq_n) | |
; qc_tally_out(rec, pam, id_clean, rec->clean_n) | |
; } | |
else | |
{ rec->out_message = n_clean_best ? cfl[n_clean_best > 9 ? 9 : n_clean_best] : msg | |
; dump_read(0, pam->fplint, rec, NULL, NULL, pam) | |
; qc_tally_out(rec, pam, pam->buck_n, rec->seq_n) /* specialbucket */ | |
; rec->rs.N_lint++ | |
; if (n_clean_best > 1) | |
rec->rs.N_cfl++ | |
; } | |
/* in do_barcode_3p, buck[].fp, fplint */ | |
if | |
( pam->verbose & VB_ALIGN | |
&& rec->rs.N % pam->sample_lint == 0 | |
) | |
{ for (buckid = 0; buckid < pam->buck_n; buckid++) | |
{ int score = 0 | |
; if | |
( sw_fill2 | |
( &ai | |
, data | |
, MATRIX_SIZE | |
, rec->seq | |
, pam->buck[buckid].cat3p | |
, indel_allowed | |
, &pam->swp_3p | |
) ) | |
X_FINISHED("sw error in do_barcode_3p") | |
; rec->rs.N_cells += ai.ni * ai.nj | |
; sw_trace3(&ai, &pam->swp_3p, ai.max_ij, 0) /* do_barcode_3p */ | |
; sw_match_bc3p | |
( &ai | |
, &pam->mr_3p_global | |
, &pam->mr_3p_prefix | |
, &pam->mr_3p_barcode | |
, pam->buck[buckid].sinsert3p_n | |
, pam->buck[buckid].barcode_n | |
, &score /* ignored */ | |
) | |
; sw_pp2 | |
( "read" | |
, x_3pbc[pam->buck[buckid].cache_accept] | |
, pam->buck[buckid].cache_score | |
, &ai | |
, pam->fplint | |
, pam->zippit | |
, rec->rs.N | |
, NULL | |
) | |
/* in do_barcode_3p */ | |
; } | |
} | |
return 0 | |
; } | |
int n_zip_mismatch | |
( const char* s1 | |
, const char* s2 | |
) | |
{ int n = 0 | |
; const char* a, *b | |
; for (a=s1,b=s2; *a && *b; a++, b++) | |
n += *a != *b | |
; return n | |
; } | |
int do_barcode_5p | |
( struct record *rec | |
, struct param *pam | |
) | |
{ int buckid = 0 | |
; int bestid = -1 | |
; struct match_requirement mr = pam->mr_5p_barcode | |
; int indel_allowed = mr.mr_maxgap > 0 | |
; int bestend = -1 | |
; int conflict = 0 | |
; SWNUM data[MATRIX_SIZE] = { 0 } | |
; struct sw_alninfo ai = { 0 } | |
; { for (buckid = 0; buckid < pam->buck_n; buckid++) | |
{ if (!strncmp(pam->buck[buckid].barcode5p, rec->seq, pam->buck[buckid].barcode_n)) | |
{ bestend = pam->buck[buckid].barcode_n | |
; break | |
; } | |
} | |
if (bestend > 0) | |
bestid = buckid | |
; } | |
/* mr.mr_offset & 1 indicates a 'zip' alignment, | |
* toe to toe and head to head aligned with only substitutions allowed. | |
*/ | |
if (bestid < 0 && mr.mr_maxedit > 0 && (mr.mr_offset & 1)) | |
{ for (buckid = 0; buckid < pam->buck_n; buckid++) | |
{ int z = INT_MAX | |
; if (pam->buck[buckid].barcode_n <= rec->seq_n) | |
z = n_zip_mismatch(pam->buck[buckid].barcode5p, rec->seq) | |
#if WE_USE_ZLIB | |
;if(0)gzprintf(pam->fplint, "code %s seq %s zip %d\n", pam->buck[buckid].barcode5p, rec->seq, z) | |
#endif | |
; if (z <= mr.mr_maxedit) | |
{ if (bestid >= 0) | |
{ conflict = 1 | |
#if WE_USE_ZLIB | |
; gzprintf | |
( pam->fplint | |
, "> 5p-conflict-zip %s %s [%s/%d] recno %d\n" | |
, pam->buck[bestid].barcode5p | |
, pam->buck[buckid].barcode5p | |
, rec->seq | |
, (int) rec->seq_n | |
, (int) rec->rs.N | |
) | |
#endif | |
; bestid = -1 | |
; break | |
; } | |
else | |
bestid = buckid | |
; } | |
} | |
if (!conflict && bestid >= 0) | |
bestend = pam->buck[bestid].barcode_n | |
; } | |
if (!conflict && bestid < 0 && (mr.mr_offset & 2) && mr.mr_maxedit) | |
{ int n_edit = 0 | |
; int n_gap = 0 | |
; for (buckid = 0; buckid < pam->buck_n; buckid++) | |
{ int found_alignment = 0 | |
; pam->swp_5p.right_limit = pam->buck[buckid].barcode_n + mr.mr_maxgap | |
; if (rec->seq_n < pam->buck[buckid].barcode_n) | |
continue | |
; if | |
( sw_fill2 | |
( &ai | |
, data | |
, MATRIX_SIZE | |
, pam->buck[buckid].barcode5p | |
, rec->seq | |
, indel_allowed | |
, &pam->swp_5p /* governs alignment algorithm, settings are very stable */ | |
) ) | |
X_FINISHED("sw error in do_barcode_5p") | |
; rec->rs.N_cells += ai.ni * ai.nj | |
; sw_trace3(&ai, &pam->swp_5p, ai.max_ij, 0) /* do_barcode_5p */ | |
; n_gap = ai.n_insl + ai.n_insr | |
; n_edit = ai.n_subst + n_gap | |
; found_alignment | |
= ( (ai.lft_end - ai.lft_start) + 1 + mr.mr_maxedit >= pam->buck[buckid].barcode_n + n_edit /* + (ai.rgt_start - 1) */ | |
&& mr.mr_maxedit >= n_edit | |
&& mr.mr_maxgap >= n_gap | |
) | |
;if(0)fprintf(stderr, "%d\n", (int) ((ai.lft_end - ai.lft_start) + 1)) | |
;if(0)fprintf(stderr, "found %d bc %s (%d %d)\n", (int) found_alignment, pam->buck[buckid].barcode5p, (int) n_edit, (int) (ai.rgt_start -1)) | |
/* in do_barcode_5p, buck[].fp, fplint */ | |
; if | |
( pam->verbose & VB_ALIGN | |
&& rec->rs.N % pam->sample_lint == 0 | |
) | |
sw_pp2("read", x_5pbc[found_alignment], -1, &ai, pam->fplint, pam->zippit, rec->rs.N, NULL) | |
/* in do_barcode_5p */ | |
; if (found_alignment) | |
{ if (bestid >= 0) | |
{ | |
#if WE_USE_ZLIB | |
; gzprintf(pam->fplint, "> 5p-conflict-align %s %s %s\n", pam->buck[bestid].barcode5p, pam->buck[buckid].barcode5p, rec->seq) | |
#endif | |
; bestid = -1 | |
; conflict = 1 | |
; break | |
; } | |
else | |
bestid = buckid | |
, bestend = ai.rgt_end + (pam->buck[bestid].barcode_n - ai.lft_end) | |
; } | |
} | |
} | |
if (bestid < 0) | |
{ rec->rs.N_lint++ | |
; rec->out_message = conflict ? "conflict" : "nobc" | |
; qc_tally_out(rec, pam, pam->buck_n, rec->seq_n) /* specialbucket */ | |
; dump_read(0, pam->fplint, rec, NULL, NULL, pam) | |
; if (conflict) | |
rec->rs.N_cfl++ | |
; else | |
rec->rs.N_nomatch++ | |
; } | |
else | |
{ qc_tally_in(rec, pam, bestid, rec->seq_n) /* tally in per-barcode */ | |
#if WE_USE_ZLIB | |
;if(0)gzprintf(pam->fplint, "# %s %s\n", pam->buck[bestid].barcode5p, rec->seq) | |
#endif | |
; rec_remove_prefix(rec, bestend) /* inclusive end in 1-based offset == length [12345] */ | |
; pam->sinsert_5p | |
= (pam->buck[bestid].sinsert5p_n) | |
? pam->buck[bestid].sinsert5p | |
: NULL | |
; pam->adaptor_3p = pam->buck[bestid].adaptor3p | |
; pam->the_tabu2 = pam->buck[0].tabu2 | |
; X_CASCADE(ntnt(rec, pam, pam->buck[bestid].fp)) | |
; if (rec->clean_n >= 0) | |
qc_tally_out(rec, pam, bestid, rec->clean_n) | |
; } | |
return 0 | |
; } | |
int parse_mr | |
( const char* form | |
, struct match_requirement* mr | |
, int maxnums | |
) | |
{ int n_consumed, ok = 1 | |
; mr->mr_maxgap = 0 | |
; do | |
{ if | |
( maxnums >= 4 | |
&& 4 == sscanf(form, "%u/%u/%u/%d%n", | |
&mr->mr_minlen, &mr->mr_maxedit, &mr->mr_maxgap, &mr->mr_offset, &n_consumed) | |
) | |
break | |
; if | |
( maxnums >= 3 | |
&& 3 == sscanf(form, "%u/%u/%u%n", | |
&mr->mr_minlen, &mr->mr_maxedit, &mr->mr_maxgap, &n_consumed) | |
) | |
break | |
; if | |
( maxnums >= 2 | |
&& 2 == sscanf(form, "%u/%u%n", | |
&mr->mr_minlen, &mr->mr_maxedit, &n_consumed) | |
) | |
break | |
; ok = 0 | |
; } | |
while (0) | |
; if (ok && mr->mr_maxedit < mr->mr_maxgap) | |
{ argh | |
( "error" | |
, "maximum gap count %d should not exceed maximum edit distance %d in %s" | |
, ji mr->mr_maxgap | |
, ji mr->mr_maxedit | |
, form | |
) | |
; mr->mr_maxedit = mr->mr_maxgap | |
; argh("fixit", "setting maximum edit distance to %d", ji mr->mr_maxedit) | |
; } | |
return ok && n_consumed == strlen(form) | |
; } | |
int parse_anchor | |
( const char* form | |
, int *d | |
, const char **anchor | |
) | |
{ int n_consumed = 0 | |
; if (1 != sscanf(form, "%d/%n", d, &n_consumed)) | |
return 0 | |
; if (n_consumed == 0 || form[n_consumed-1] != '/') | |
return 0 | |
; d[0] -= 1 | |
; anchor[0] = form + n_consumed | |
; return 1 | |
; } | |
int parse_pair | |
( const char* form | |
, unsigned *u1 | |
, unsigned *u2 | |
) | |
{ int n_consumed | |
; if (2 != sscanf(form, "%u/%u%n", u1, u2, &n_consumed)) | |
return 0 | |
; if (n_consumed != strlen(form)) | |
return 0 | |
; return 1 | |
; } | |
int parse_dust_spec | |
( const char* form | |
, unsigned *cutoff | |
, unsigned *bases | |
) | |
{ int n_converted = 0 | |
; if | |
( 2 != sscanf(form, "%u/%n", cutoff, &n_converted) | |
&& 1 != sscanf(form, "%u", cutoff) | |
) | |
return 0 | |
; if (n_converted > 0) | |
{ const char* o = form + n_converted | |
; if (strchr(o, 'A')) bases[0] |= 1 << 0 | |
; if (strchr(o, 'C')) bases[0] |= 1 << 1 | |
; if (strchr(o, 'G')) bases[0] |= 1 << 2 | |
; if (strchr(o, 'T')) bases[0] |= 1 << 3 | |
; } | |
return 1 | |
; } | |
static volatile sig_atomic_t g_abort_loop = 0; | |
void reaper_sig_catch | |
( int sig | |
) | |
{ if (sig == SIGINT) | |
g_abort_loop = 1 | |
; } | |
int reaper_main | |
( int argc | |
, const char* argv[] | |
) | |
{ ZFILE input = NULL | |
; const char* g_fnin = "-" | |
; char* fnclean = NULL, *fnlint = NULL | |
; unsigned g_limit = 0 | |
; const char* g_format_in = "@%I%A%n%R%n+%#%Q%n" | |
; const char* g_format_in2 = NULL | |
; int status = 19 | |
; int called_from_R = 0 | |
/* below initialises read_stats_global (rec.rs) */ | |
; struct record rec = { { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, 0 } | |
; struct param pam = { 0 } | |
#ifdef WINDOWS_BUILD | |
; set_argh("reaperlog.txt", 1) | |
#endif | |
; param_init(&pam) | |
; g_abort_loop = 0 | |
; signal(SIGINT, reaper_sig_catch) | |
; themap_init() | |
; | |
/* enter macromagicalitaciousness */ | |
arg_switch() | |
optarg("-i") g_fnin = thearg(); endarg() | |
optarg("-fastq") g_fnin = thearg(); endarg() | |
optarg("-clean-length") pam.cleanlength = atoi(thearg()); endarg() | |
optarg("-polya") pam.polya = atoi(thearg()); endarg() | |
optarg("-dust-suffix") | |
if (!parse_dust_spec(thearg(), &pam.taildust, &pam.taildust_bases)) | |
X_ERR_JUMP(DONE, "-dust-suffix argument needs <cut-off> or <cutoff>/<ACGT> pattern"); | |
endarg() | |
optarg("-dust-suffix-late") | |
if (!parse_dust_spec(thearg(), &pam.taildustlate, &pam.taildustlate_bases)) | |
X_ERR_JUMP(DONE, "-dust-suffix-late argument needs <cut-off> or <cutoff>/<ACGT> pattern"); | |
endarg() | |
optarg("-qqq-check") | |
if | |
( 4 != sscanf(thearg(), "%u/%u/%u/%u", &pam.q_cutoff, &pam.q_winlen, &pam.q_winofs, &pam.q_readofs) | |
&& 3 != sscanf(thearg(), "%u/%u/%u", &pam.q_cutoff, &pam.q_winlen, &pam.q_winofs) | |
&& 2 != sscanf(thearg(), "%u/%u", &pam.q_cutoff, &pam.q_winlen) | |
) | |
X_ERR_JUMP(DONE, "-medq argument needs <cutoff>/<winlen>[/<winoffset>] pattern"); | |
if (pam.q_winofs > pam.q_winlen) pam.q_winofs = pam.q_winlen; | |
if (pam.q_winlen > 255) pam.q_winlen = 255; | |
if (pam.q_winlen % 2 == 0) pam.q_winlen++; | |
if (!pam.q_winofs) pam.q_winofs = (pam.q_winlen + 1) / 2; | |
endarg() | |
optarg("-anchor") | |
if (!parse_anchor(thearg(), &pam.anchor_offset, &pam.anchor_string)) | |
X_ERR_JUMP(DONE, "-anchor argument does not follow <OFFSET>/<BASES> pattern"); | |
fprintf(stderr, "filtering on substring %s at position %d\n", pam.anchor_string, pam.anchor_offset+1); | |
endarg() | |
optarg("-tri") | |
pam.trintco = atoi(thearg()); | |
if (pam.trintco < 0 || pam.trintco > 100) { | |
argh("reaper", "-trint option should take argument in [0..100]"); | |
pam.trintco = 0; | |
} | |
endarg() | |
optarg("-mr-tabu") | |
if (!parse_mr(thearg(), &pam.mr_tabu, 4)) | |
X_ERR_JUMP(DONE, "-mr-tabu argument needs x/y or x/y/z or x/y/z/u pattern"); | |
endarg() | |
optarg("-5p-barcode") | |
if (!parse_mr(thearg(), &pam.mr_5p_barcode, 4)) | |
X_ERR_JUMP(DONE, "-5p-barcode argument needs x/y or x/y/z or x/y/z/u pattern"); | |
endarg() | |
optarg("-5p-sinsert") | |
if (!parse_mr(thearg(), &pam.mr_5p_require, 4)) | |
X_ERR_JUMP(DONE, "-5p-sinsert argument needs x/y or x/y/z or x/y/z/u pattern"); | |
endarg() | |
optarg("-3p-global") | |
if (!parse_mr(thearg(), &pam.mr_3p_global, 4)) | |
X_ERR_JUMP(DONE, "-3p-global argument needs x/y or x/y/z or x/y/z/u pattern"); | |
endarg() | |
optarg("-3p-prefix") | |
if (!parse_mr(thearg(), &pam.mr_3p_prefix, 4)) | |
X_ERR_JUMP(DONE, "-3p-prefix argument needs x/y or x/y/z or x/y/z/u pattern"); | |
endarg() | |
optarg("-3p-barcode") | |
if (!parse_mr(thearg(), &pam.mr_3p_barcode, 3)) | |
X_ERR_JUMP(DONE, "-3p-barcode argument needs x/y or x/y/z pattern"); | |
endarg() | |
optarg("-3p-head-to-tail") pam.p_3p_tth = atoi(thearg()); endarg() /* suffix perfect match */ | |
optarg("-swp") | |
if (3 != sscanf(thearg(), "%u/%u/%u", | |
&pam.swp_3p.gain_match, &pam.swp_3p.cost_subst, &pam.swp_3p.cost_gapleft) | |
) | |
X_ERR_JUMP(DONE, "-swp argument needs MATCH/SUBS/GAP pattern (gain/cost/cost)"); | |
pam.swp_3p.cost_gapright = pam.swp_3p.cost_gapleft; | |
pam.swp_5p = pam.swp_3p; | |
pam.swp_tabu = pam.swp_3p; | |
pam.swp_raptor= pam.swp_3p; | |
endarg() | |
optarg("-loco") | |
pam.loco_check = atof(thearg()); | |
endarg() | |
optarg("-geom") | |
if (!strcmp(thearg(), "3p-bc")) | |
g_mode = CASE_3P_BC | |
; else if (!strcmp(thearg(), "5p-bc")) | |
g_mode = CASE_5P_BC | |
; else if (!strcmp(thearg(), "no-bc")) | |
g_mode = CASE_NOBC | |
; else | |
X_ERR_JUMP(DONE, "unsupported geometry [%s]", thearg()) | |
; | |
endarg() | |
optarg("-meta") pam.fnmeta = thearg(); endarg() | |
optarg("-record-format2") g_format_in2 = thearg(); endarg() | |
optarg("-record-format") g_format_in = thearg(); endarg() | |
uniarg("--fasta-in") g_format_in = ">%I%A%n%R%n"; endarg() | |
uniarg("--fastqx-out") | |
pam.format_clean = "@%I%trecno=%J%n%C%n+%n%Q%n"; | |
pam.format_lint = "@%I%trecno=%J%tmsg=%M%n%R%n+%n%Y%n"; | |
endarg() | |
uniarg("--fasta-out") | |
pam.format_clean = ">%I%n%C%n"; | |
pam.format_lint = ">%I%tmsg=%M%n%R%n"; | |
endarg() | |
uniarg("--fastax-out") | |
pam.format_clean = ">%I%trecno=%J%n%C%n"; | |
pam.format_lint = ">%I%trecno=%J%tmsg=%M%n%R%n"; | |
endarg() | |
optarg("-sc-max") pam.suffix_complexity_max = atof(thearg()); endarg() | |
optarg("-nnn-check") | |
if (!parse_pair(thearg(), &pam.nnn_count, &pam.nnn_outof)) | |
X_ERR_JUMP(DONE, "-nnn-check argument does not follow x/y pattern"); | |
endarg() | |
optarg("-sample") | |
if (!parse_pair(thearg(), &pam.sample_clean, &pam.sample_lint)) | |
X_ERR_JUMP(DONE, "-sample argument does not follow x/y pattern"); | |
endarg() | |
optarg("-debug") | |
if (strchr(thearg(), 'a')) pam.verbose |= VB_ALIGN; | |
if (strchr(thearg(), 'c')) pam.verbose |= VB_CLEAN; | |
if (strchr(thearg(), 'l')) pam.verbose |= VB_LINT; | |
endarg() | |
optarg("-do") g_limit = atoi(thearg()); endarg() | |
uniarg("--nozip") pam.zippit = 0; endarg() | |
uniarg("--R") called_from_R = 1; endarg() | |
uniarg("--noqc") BIT_OFF(pam.modes, MODE_QC_FULL);endarg() | |
uniarg("--bcq-early") BIT_ON(pam.modes, MODE_QC_EARLY);endarg() | |
uniarg("--bcq-late") BIT_ON(pam.modes, MODE_QC_LATE); endarg() | |
uniarg("--full-length") BIT_ON(pam.modes, MODE_FULLLENGTH ); endarg() | |
uniarg("--keep-all") BIT_ON(pam.modes, MODE_KEEPALL ); endarg() | |
optarg("-3pa") pam.adaptor_3p = hte(thearg()); endarg() | |
optarg("-raptor") pam.adaptor_test = thearg(); endarg() | |
optarg("-kmer") pam.kmer = atoi(thearg()); endarg() | |
optarg("-5psi") pam.sinsert_5p = hte(thearg()); endarg() | |
optarg("-tabu") parse_tabu(&pam.the_tabu2, hte(thearg()), g_restrict); endarg() | |
optarg("-basename") pam.basename = thearg(); endarg() | |
optarg("-format-clean") pam.format_clean = thearg(); endarg() | |
optarg("-length-limit") pam.length_limit = atoi(thearg()); endarg() | |
optarg("-format-lint") pam.format_lint = thearg(); endarg() | |
optarg("-trim-length") pam.length_co = atoi(thearg()); endarg() | |
uniarg("--stream-clean") BIT_ON(pam.modes, MODE_STREAMCLEAN); endarg() | |
uniarg("--record-format") | |
fprintf(stdout, "-record-format <format string> (extended record description syntax)\n"); | |
puts(" %R expect read (longest sequence found over [a-zA-Z]*) - empty read allowed"); | |
puts(" %I expect identifier (longest sequence of non-blank)"); | |
puts(" %A expect overflow annotation field (initial blanks skipped, rest of line)"); | |
puts(" %a use previous %F or %G or %H field as annotation"); | |
puts(" %B expect annotation field (longest sequence of non-blank)"); | |
puts(" %Q expect quality (longest sequence of non-blank)"); | |
puts(" %X expect count (a nonnegative integer number)"); | |
puts(" %F expect and discard field (longest sequence of non-tab)"); | |
puts(" %G expect and discard field (longest sequence of non-blank)"); | |
puts(" %Hx (x is a placeholder) expect and discard up to x"); | |
puts(" %# discard everything until end of line"); | |
puts(" %b expect run of blanks (space or tab)"); | |
puts(" %n expect end of line match"); | |
puts(" %. expect and discard any character"); | |
puts(" %s expect a space"); | |
puts(" %t expect a tab"); | |
puts(" %% expect a percent sign"); | |
puts("Anything else requires a literal match, see the FASTA example just below"); | |
puts("The string '>%I%#%R%n' will parse FASTA-ish input (but use --fasta-in instead)"); | |
puts("The string '%F%t%R%#' retrieves the second field from each line"); | |
status = 0; | |
goto DONE; | |
endarg() | |
optarg("-restrict") | |
g_restrict = atoi(thearg()); | |
endarg() | |
optarg("-guard") | |
g_guard = atoi(thearg()); | |
endarg() | |
uniarg("-z") | |
fprintf(stdout, "Struct param: %d\n", (int) sizeof(struct param)); | |
fprintf(stdout, "Struct bucket: %d\n", (int) sizeof(struct bucket)); | |
status = 0; | |
goto DONE; | |
endarg() | |
uniarg("--version") | |
fprintf(stdout, "Reaper version: %s\n", reaper_tag); | |
status = 0; | |
goto DONE; | |
endarg() | |
uniarg("-h") | |
puts("\nRequired options"); | |
puts("-geom <mode> mode in {no-bc, 3p-bc, 5p-bc}"); | |
puts("-meta <fname> file with geometry-dependent format. Required columns:"); | |
puts(" Geometry Columns:"); | |
puts(" no-bc 3p-ad - - - tabu"); | |
puts(" 3p-bc 3p-ad barcode 3p-si - tabu"); | |
puts(" 5p-bc 3p-ad barcode - 5p-si tabu"); | |
puts("bc=barcode, ad=adaptor, si=sequence insert"); | |
puts("Columns 3p-si, 5p-si, 3p-ad and tabu may all be empty"); | |
puts("Alternatively, to express absence, a single hyphen may be used"); | |
puts("\nImportant options"); | |
puts("-i <fname> input stream (gzipped file allowed) (default STDIN)"); | |
fprintf(stdout, "-clean-length <int> minimum allowed clean length (default %d)\n", (int) pam.cleanlength); | |
puts("-guard <int> protect first <int> bases in read from adapter and tabu matching"); | |
fprintf(stdout, "-restrict <int> only use the first <int> bases of adapter or tabu sequence (default %d)\n", (int) g_restrict); | |
puts(" This is to avoid false positive matches"); | |
puts("-tri <threshold> filter out reads with tri-nt score > threshold"); | |
puts(" a reasonable <threshold> is 35"); | |
fprintf(stdout, "-qqq-check <val>/<winlen> cut sequence when median quality drops below <val>\n"); | |
fprintf(stdout, "-qqq-check <val>/<winlen>/<winofs> as above, cut at <winofs> (default %d)\n", (int) pam.q_winofs); | |
fprintf(stdout, "-qqq-check <val>/<winlen>/<winofs>/<readofs> as above, start at <readofs>\n"); | |
fprintf(stdout, "-dust-suffix <threshold> dust theshold for read suffix (default %d, suggested 20)\n", (int) pam.taildust); | |
fprintf(stdout, "-nnn-check <count>/<outof> (default %d/%d)\n", (int) pam.nnn_count, (int) pam.nnn_outof); | |
puts(" disregard read onwards from seeing <count> N's in <outof> bases"); | |
puts("\nAlignment options"); | |
puts("Options to specify when part of an alignment triggers a match:"); | |
#define local_EXPAND_MR(mr) (int) mr.mr_minlen, (int) mr.mr_maxedit, (int) mr.mr_maxgap, (int) mr.mr_offset | |
fprintf(stdout, "-3p-global l/e[/g[/o]] (default %d/%d/%d/%d)\n", local_EXPAND_MR(pam.mr_3p_global)); | |
fprintf(stdout, "-3p-prefix l/e[/g[/o]] (default %d/%d/%d/%d)\n", local_EXPAND_MR(pam.mr_3p_prefix)); | |
fprintf(stdout, "-3p-barcode l/e[/g[/o]] (default %d/%d/%d/%d)\n", local_EXPAND_MR(pam.mr_3p_barcode)); | |
fprintf(stdout, "-5p-barcode l/e[/g[/o]] (default %d/%d/%d/%d)\n", local_EXPAND_MR(pam.mr_5p_barcode)); | |
fprintf(stdout, "-5p-sinsert l/e[/g[/o]] (default %d/%d/%d/%d)\n", local_EXPAND_MR(pam.mr_5p_require)); | |
fprintf(stdout, "-mr-tabu l/e[/g[/o]] (default %d/%d/%d/%d)\n", local_EXPAND_MR(pam.mr_tabu)); | |
fprintf(stdout, "-3p-head-to-tail l minimal trailing perfect match length (default %d)\n", (int) pam.p_3p_tth); | |
#undef local_EXPAND_MR | |
puts(" syntax used in the above:"); | |
puts(" l <int> minimum length required to count sub-alignment as match"); | |
puts(" e <int> maximum allowed edit distance"); | |
puts(" g <int> [optional, not active when set to 0] maximum allowed number of gaps"); | |
puts(" o <int> [optional, not active when set to 0] offset:"); | |
puts(" o= 5 requires alignment to start in the first five bases of adaptor"); | |
puts(" o=-5 requires alignment to end in the last five bases of adaptor"); | |
fprintf(stdout, "-swp M/S/G match/substitution/gap gain/cost/cost (default %u/%u/%u)\n", | |
pam.swp_3p.gain_match, pam.swp_3p.cost_subst, pam.swp_3p.cost_gapleft); | |
puts("\nInput/output options"); | |
puts("--fasta-in read FASTA input"); | |
fprintf(stdout, "-record-format <format string> (record description, default %s)\n", g_format_in); | |
puts(" [ -record-format syntax is output when supplying --record-format ]"); | |
puts("-record-format2 <format string> (simple line formats, one field per line):"); | |
puts(" R read"); | |
puts(" I read identifier"); | |
puts(" Q quality scores"); | |
puts(" D discard field"); | |
puts(""); | |
puts("-basename <pfx> pfx.lint.gz, pfx.clean.gz pfx.report etc will be constructed"); | |
puts("-format-clean <format string> (output for clean reads)"); | |
puts("-format-lint <format string> (output for filtered reads)"); | |
puts(" -format-clean/lint specification syntax:"); | |
puts(" %R read"); | |
puts(" %C clean read"); | |
puts(" %Z clean read padded with Ns if necessary"); | |
puts(" %V reverse complement of clean read"); | |
puts(" %I read identifier"); | |
puts(" %Q clean or input read quality (for clean / lint file respectively)"); | |
puts(" %X read count (only applicable if -record-format is used)"); | |
puts(" %Y input read quality"); | |
puts(" %q<c> clean input read quality padded with character <c>"); | |
puts(" %A annotation field"); | |
puts(" %L clean read length"); | |
puts(" %M message describing cause for filtering (lint file)"); | |
puts(" %T trinucleotide complexity score (clean/lint file)"); | |
puts(" %U dUst sUffix complexity information"); | |
puts(" %3 best read/3p-adaptor alignment"); | |
puts(" %= alignment characteristics"); | |
puts(" mt=matchtype"); | |
puts(" sc=suffix-complexity"); | |
puts(" ht=head-tail-match"); | |
puts(" nn=N-match-offset"); | |
puts(" bb=B-match-offset"); | |
puts(" aa=Polya-offset"); | |
puts(" qq=Quality-trim-offset"); | |
puts(" %n newline"); | |
puts(" %J record offset, unique for each read. Use to match paired-end reads"); | |
puts(" %f fastq line number based on standard fastq format"); | |
puts(" %t tab"); | |
puts(" %% percent sign"); | |
puts(" Anything else is copied verbatim"); | |
puts("-debug [acl]+ a=alignments c=clean l=lint"); | |
puts("-sample c/l if debug, sample every c/l clean/lint read"); | |
puts("--nozip do not output gzipped files"); | |
puts("--noqc do not output quality report files"); | |
puts("\nMiscellaneous options"); | |
puts("--bcq-early perform early 'B' quality filtering (when reading sequences)"); | |
puts("--bcq-late perform late 'B' quality filtering (before outputting sequences)"); | |
puts("--full-length only allow reads not shortened in any filter step"); | |
puts("--keep-all delete rather than discard reads (e.g. tabu match, missing 5p-sinsert)"); | |
puts("-trim-length <int> cut reads back to length <int>"); | |
fprintf(stdout, "-polya <int> remove trailing A's if length exceeds <int>\n"); | |
fprintf(stdout, "-sc-max <f> threshold for complexity of suffix following prefix match (default %.2f)\n", pam.suffix_complexity_max); | |
puts("\nOptions for use when running reaper with -geom no-bc"); | |
puts("-3pa <three prime adaptor>"); | |
puts("-5psi <five prime sequence insert>"); | |
puts("-tabu <tabu sequence>"); | |
puts(""); | |
status = 0; | |
X_ERR_JUMP(DONE, "DON'T PANIC"); | |
endarg() | |
failarg() | |
arg_done() | |
/* curtains macromagicalitaciousness */ | |
; { unsigned dummy | |
; kraken_readline(NULL, NULL, 0, NULL, &dummy) /* resets static buffers */ | |
; } | |
if (called_from_R) | |
argh("reaper", "R is calling") | |
; if (g_mode == CASE_NOTSPECIFIED) | |
X_ERR_JUMP(DONE, "-geom option required (see -h for available options)") | |
; else if (g_mode > CASE_NOBC && !pam.fnmeta) | |
X_ERR_JUMP(DONE, "-meta option required with geometry %s", G_MODE[g_mode]) | |
/* fixme: the logic of barcodes_io_open | |
* versus fpclean is somewhat entangled | |
* and dispersed throughout. | |
*/ | |
; if (pam.fnmeta) | |
{ if (config_read(pam.fnmeta, &pam, g_mode, g_restrict)) | |
X_ERR_JUMP(DONE, "failed to parse barcode spec file") | |
; if | |
( g_mode != CASE_NOBC | |
&& barcodes_io_open(pam.buck, pam.basename, pam.buck_n, pam.zippit) | |
) | |
X_ERR_JUMP(DONE, "error opening barcode bucket files") | |
; X_TRY_JUMP(DONE, buckets_prepare_qc(pam.buck, pam.buck_n, g_mode)) | |
; if (g_mode == CASE_3P_BC) | |
{ pam.swp_3p.left_skip = pam.cleanlength | |
; pam.swp_3p.right_limit = pam.mr_3p_global.mr_minlen /* fixme document */ | |
; pam.swp_3p.flags |= SW_ENDTOSTART | |
; pam.p_3p_tth = 0 | |
; } | |
/* retireme/fixme: the below is spurious given call above. */ | |
#if 0 | |
/* fixme: this is a bit of a kludge: separate tally/qc stuff */ | |
; if (g_mode == CASE_NOBC) | |
X_TRY_JUMP(DONE, buckets_prepare_qc(pam.buck, pam.buck_n, g_mode)) | |
#endif | |
; } | |
else if (g_mode == CASE_NOBC) | |
{ X_TRY_JUMP(DONE, buckets_alloc(&pam, 1)) | |
; X_TRY_JUMP(DONE, adaptors_set_case_3p(pam.buck, &pam.buck_n, &pam)) | |
; X_TRY_JUMP(DONE, buckets_prepare_qc(pam.buck, pam.buck_n, g_mode)) | |
/* note: in this branch we do not open io for (the one) bucket */ | |
; } | |
else | |
X_ERR_JUMP(DONE, "-meta option is required for all modes except '3p-ad'") | |
; fnclean | |
= stringle | |
( "%s.%s.%s" | |
, pam.basename | |
, (g_mode == CASE_NOBC) ? "lane" : "ocean" /* ocean case should not materialise */ | |
, pam.zippit ? "clean.gz" : "clean" | |
) | |
; fnlint = stringle("%s.%s", pam.basename, pam.zippit ? "lint.gz" : "lint") | |
; | |
if (!(input = myfopen(g_fnin, "r", 1))) | |
X_ERR_JUMP(DONE, "bailing out") | |
; if (g_mode == CASE_NOBC) | |
{ if (pam.modes & MODE_STREAMCLEAN) | |
fnclean = stringle("-") | |
; if (!(pam.fpclean = myfopen(fnclean, ZWMODE, pam.zippit))) | |
X_ERR_JUMP(DONE, "bailing out") | |
; } | |
if (!(pam.fplint = myfopen(fnlint, ZWMODE, pam.zippit))) | |
X_ERR_JUMP(DONE, "bailing out") | |
; { clock_t t1 = clock(), t2 | |
; struct read_stats_global rsprev = { 0 } /* read stats per million */ | |
; int anchor_len = 0 | |
; if (pam.anchor_string) | |
anchor_len = strlen(pam.anchor_string) | |
; fprintf(stderr, "---\n") | |
; fprintf(stderr, "mRpm million reads per minute\n") | |
; fprintf(stderr, "mNpm million nucleotides per minute\n") | |
; fprintf(stderr, "mCps million alignment cells per second\n") | |
; fprintf(stderr, "lint total removed reads (per 10K), sum of columns to the left\n") | |
; fprintf | |
( stderr | |
, "%-37s %7s %3s %4s %4s %4s %6s%3s %4s %4s %4s %4s %4s %4s %4s %4s %4s%s\n" | |
, "25K reads per dot, 1M reads per line" | |
, "seconds", "mr", "mRpm", "mNpm", "mCps" | |
, "{error", "qc", "low", "len", "NNN", "tabu", "nobc", "cflr", "cfl", "lint", "OK", "} per 10K" | |
) | |
; do | |
{ unsigned rrstat = g_format_in2 | |
? read_record2(input, g_format_in2, &rec, pam.length_limit) | |
: read_recordx(input, g_format_in, &rec) | |
; if (rec.seq_n > MATRIX_SIZE / 24) | |
rec.seq_n = MATRIX_SIZE / 24 | |
, rec.seq[rec.seq_n] = '\0' | |
; if (rrstat == RR_DONE) | |
break | |
; else if (rrstat == RR_NOMEM || rrstat == RR_ERROR) | |
goto DONE | |
; if (pam.modes & MODE_FULLLENGTH) | |
pam.cleanlength = rec.seq_n | |
; rec.rs.B += rec.seq_n | |
; do | |
{ qc_tally_in(&rec, &pam, pam.buck_n, rec.seq_n) /* specialbucket */ | |
; if (pam.anchor_offset >= 0 && pam.anchor_string) | |
{ if | |
( pam.anchor_offset <= rec.seq_n | |
&& strncmp(rec.seq+pam.anchor_offset, pam.anchor_string, anchor_len) | |
) | |
{ rec.rs.N_anchor++ | |
; rec.rs.N_lint++ | |
; break | |
; } | |
} | |
if (pam.modes & MODE_QC_EARLY) | |
{ int bbb_ofs | |
; if ((bbb_ofs = get_bbb_offset(&rec)) > 0) | |
{ rec.rs.B_BB += rec.seq_n + 1 - bbb_ofs | |
; if (bbb_ofs <= pam.cleanlength) | |
{ rec.rs.N_bb++; rec.rs.N_lint++; break ; } | |
else | |
{ rec.seq_n = bbb_ofs-1 | |
; rec.q_n = bbb_ofs-1 | |
; rec.seq[bbb_ofs-1] = '\0' | |
; rec.q[bbb_ofs-1] = '\0' | |
; } | |
} | |
} | |
if (pam.loco_check && loco(&rec, &pam, pam.loco_check)) | |
{ rec.rs.N_loco++; rec.rs.N_lint++; break ; } | |
switch(g_mode) | |
{ case CASE_NOBC | |
: do_adaptor_3p(&rec, &pam); break | |
; | |
case CASE_3P_BC | |
: do_barcode_3p(&rec, &pam); break | |
; | |
case CASE_5P_BC | |
: do_barcode_5p(&rec, &pam); break | |
; | |
case CASE_NOTSPECIFIED | |
: X_ERR_JUMP(DONE, "impossible") | |
; } | |
/* pam->verbose & VB_ALIGN */ | |
; } | |
while (0) | |
; if (25000 * (rec.rs.N / 25000) == rec.rs.N) | |
{ fputc('.', stderr) | |
; if (rec.rs.N % 1000000 == 0) | |
{ double nsecs | |
; t2 = clock() | |
; unsigned diffn = (rec.rs.N - rsprev.N) | |
; unsigned diffnt = (rec.rs.B - rsprev.B) | |
; unsigned diffsw = (rec.rs.N_cells - rsprev.N_cells) | |
; nsecs = (t2 - t1) * 1.0 / CLOCKS_PER_SEC | |
; fprintf | |
( stderr | |
, " %4.0f %3d %4.1f %4.0f %4.0f" | |
, nsecs | |
, (int) (rec.rs.N / 1000000) | |
, (double) (60.0 / nsecs) | |
, (double) (diffnt * 1.0 / (nsecs * 1000000.0 / 60.0)) | |
, (double) (diffsw * 1.0 / (nsecs * 1000000.0)) | |
) | |
; fprintf | |
( stderr | |
, " %4d %4d %4d %4d %4d %4d %4d %4d %4d %4d %4d\n" | |
, (int) (10000.5 * (rec.rs.N_err - rsprev.N_err) * 1.0 / diffn) | |
, (int) (10000.5 * (rec.rs.N_bb - rsprev.N_bb ) * 1.0 / diffn) | |
, (int) (10000.5 * (rec.rs.N_loco - rsprev.N_loco) * 1.0 / diffn) | |
, (int) (10000.5 * (rec.rs.N_lenlo - rsprev.N_lenlo) * 1.0 / diffn) | |
, (int) (10000.5 * (rec.rs.N_nn - rsprev.N_nn) * 1.0 / diffn) | |
, (int) (10000.5 * (rec.rs.N_tabu - rsprev.N_tabu) * 1.0 / diffn) | |
, (int) (10000.5 * (rec.rs.N_nomatch - rsprev.N_nomatch) * 1.0 / diffn) | |
, (int) (10000.5 * (rec.rs.N_cflr - rsprev.N_cflr) * 1.0 / diffn) | |
, (int) (10000.5 * (rec.rs.N_cfl - rsprev.N_cfl) * 1.0 / diffn) | |
, (int) (10000.5 * (rec.rs.N_lint - rsprev.N_lint) * 1.0 / diffn) | |
, (int) (10000.5 * (rec.rs.N_clean - rsprev.N_clean) * 1.0 / diffn) | |
) | |
; t1 = t2 | |
; rsprev = rec.rs | |
; } | |
} | |
} | |
while (!g_abort_loop && (!g_limit || rec.rs.N < g_limit)) | |
; fputc('\n', stderr) | |
; if | |
( rec.rs.N_bb + rec.rs.N_loco | |
+ rec.rs.N_lenlo + rec.rs.N_nn | |
+ rec.rs.N_tabu + rec.rs.N_cfl | |
+ rec.rs.N_5p_nosinsert + rec.rs.N_nomatch | |
+ rec.rs.N_aa | |
+ rec.rs.N_tri + rec.rs.N_qq | |
+ rec.rs.N_anchor | |
!= rec.rs.N_lint | |
) | |
argh("reaper", "(not overly important) discarded category counts do not add up to lint total") | |
; if (rec.rs.N_clean + rec.rs.N_lint + rec.rs.N_err != rec.rs.N) | |
argh("reaper", "(not overly important) dispatch category counts do not add up to input total") | |
; if (rec.rs.N_nomsg) | |
argh("reaper", "(not overly important) there were %u discarded reads with no message", rec.rs.N_nomsg) | |
; qc_brief(&pam, &rec, g_mode) | |
; if (g_fieldtosmall) | |
argh("reaper", "___ WARNING ___ %lu cases where field was truncated", (long unsigned) g_fieldtosmall) | |
; if (pam.modes & MODE_QC_FULL) | |
qc_report(&pam, &rec, g_mode) | |
; } | |
status = 0 | |
; DONE: | |
; myfzclose(input, 1) | |
; myfzclose(pam.fpclean, pam.zippit) | |
; myfzclose(pam.fplint, pam.zippit) | |
; barcodes_io_close(pam.buck, pam.buck_n, pam.zippit) | |
; if (0) data_release(&pam) | |
; if (fnclean) free(fnclean) | |
; if (fnlint) free(fnlint) | |
; close_argh() | |
; return status | |
; } | |
#ifndef BUILD_R_BINDINGS | |
int main | |
( int argc | |
, const char* argv[] | |
) | |
{ return reaper_main(argc, argv) | |
; } | |
#endif | |
#if 0 | |
#ifdef RREAPERR | |
SEXP R_dispatchee | |
( SEXP list | |
, int (*themain)(int , const char* []) | |
, const char* callee | |
) | |
{ /* SEXP elmt = R_NilValue */ | |
; SEXP result | |
; SEXP names = getAttrib(list, R_NamesSymbol) | |
; int argc = 1 | |
; const char* argv[100] = { 0 } | |
; argv[0] = "dummy-internal" | |
; int *res = NULL | |
; int i = 0 | |
; PROTECT(result = allocVector(INTSXP, 1)) | |
; res = INTEGER(result) | |
; if (length(list) <= 49) /* space for argv[0] and final NULL */ | |
for (i = 0; i < length(list); i++) | |
{ const char* key = CHAR(STRING_ELT(names, i)) | |
; const char* val = CHAR(STRING_ELT(list, i)) | |
; if (!strlen(key)) | |
{ if (!strncmp(val, "--", 2)) | |
argv[argc++] = val | |
; else | |
{ argh(callee, "argument %s is not of type \"--mode\"", val) | |
; break | |
; } | |
} | |
else | |
{ if (!strncmp(key, "--", 2)) | |
{ if (strlen(val) && strcmp(val, "on") && strcmp(val, "ON")) | |
{ argh(callee, "option %s=%s is not recognised as \"--mode=on\"", key, val) | |
; break | |
; } | |
argv[argc++] = key | |
; } | |
else | |
{ argv[argc++] = key | |
; argv[argc++] = val | |
; } | |
} | |
} | |
; argv[argc] = NULL | |
; if (i != length(list)) | |
*res = 29 | |
; else | |
{ int j | |
; fprintf(stderr, "Passing to %s:", callee) | |
; for (j=0;j<argc;j++) | |
fprintf(stderr, " %s", argv[j]) | |
; fputc('\n', stderr) | |
; *res=themain(argc, argv) | |
; } | |
fputc('\n', stderr) | |
; UNPROTECT(1) | |
; return result | |
; } | |
SEXP reaperC | |
( SEXP list | |
) | |
{ return R_dispatchee(list, reaper_main, "reaper") | |
; } | |
#endif | |
#endif | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment