-
-
Save cadebrown/a949ed37fe15022c101cfe92f7abc72f to your computer and use it in GitHub Desktop.
mg (my-grep), an open-source, single-file implementation of regex & grep-like functionality from scratch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* mg.c - my-grep implementation, portable regex and search implementation in pure C | |
* | |
* should build on UNIX/Linux-like OSes no problem, should also work on Windows Subsystem for Linux (WSL) | |
* | |
* regex syntax: read the second comment block in this file | |
* | |
* compile like: | |
* $ cc mg.c -o mg | |
* | |
* then you can run like: | |
* | |
* $ ./mg 'a+b*c?d' file.txt other.txt | |
* ``` | |
* file.txt | 8 | abcd | |
* file.txt | 9 | ad | |
* file.txt | 10 | acd | |
* ``` | |
* | |
* You can view help/usage: | |
* $ ./mg -h | |
* ``` | |
* usage: mg [re] [src...=./] | |
* | |
* -h,--help prints this help/usage menu and exits | |
* -[DNLS] set the format columns ([Dd]ate, [Nn]ame, [Ll]ineno, [Ss]ource) | |
* | |
* see: https://cade.site/2021/09/28/diy-regex-engine | |
* author: Cade Brown <cade@cade.site> | |
* ``` | |
* | |
* you can also customize the format | |
* | |
* | |
* | |
* if you want to embed the regex library, you can see where it is split from library and CLI source code... | |
* right now it uses 'assert()' statements and hard exits on errors... obviously you'll have to inject | |
* your own error recovery/reporting | |
* | |
* use where-ever, but give me credit and email me a link to it | |
* | |
* @author: Cade Brown <cade@cade.site> | |
*/ | |
/*** REGEX SYNTAX | |
* | |
* here's the regex syntax: | |
* | |
* * a regex is made up of 0-or-more terms, connected by '|' (ior/pipe, which matches any of the terms) | |
* * a term is made up of one or more factors, connected by implicit concatenation | |
* * a factor is made up of: | |
* * a single character/escaped character | |
* * a special meta character: | |
* * '^': start of line | |
* * '$': end of line | |
* * '\s': any space character | |
* * '\d': any dogot | |
* * ... etc | |
* * a set/any (denoted like `[]`, for example `[a-zA-Z0-9_]`) | |
* * a parentheses, with an entire regex within it (for example, `(a+b)` is a single factor) | |
* * a factor may be followed by: | |
* * '?': this factor is optional to match (match 0 or 1 times) | |
* * '*': this factor can be matched 0-or-more times | |
* * '+': this factor can be matched 1-or-more times | |
* * '**{a,b,c:d}': this factor should be matched exactly a, b, or any number between and including c,d | |
* * for example, '**{2,5,8:10}' matches 2, 5, 8, 9, or 10 times | |
* | |
***/ | |
/// INCLUDES /// | |
// cstd | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stddef.h> | |
#include <stdint.h> | |
#include <stdbool.h> | |
// for STDIN_FILENO | |
#include <unistd.h> | |
// for assert() | |
#include <assert.h> | |
// for errno | |
#include <errno.h> | |
// for strcmp() | |
#include <string.h> | |
// for opendir() | |
#include <dirent.h> | |
// for stat() | |
#include <sys/stat.h> | |
// for strftime() | |
#include <time.h> | |
// define it ourself | |
#ifndef PATH_MAX | |
#define PATH_MAX 4096 | |
#endif | |
/// CONSTANTS /// | |
enum { | |
// epsilon transitions to {u,v} | |
MG_RE_NFA_EPS, | |
// match a signal character, which is a negative character | |
MG_RE_NFA_SIG, | |
// match a UCP character | |
// (no unicode for this demo, left as an exercise to the reader) | |
MG_RE_NFA_UCP, | |
// match a set of UCP characters | |
// see MG_RE_UCP | |
MG_RE_NFA_ANY, | |
}; | |
/// TYPES /// | |
// represents a single node within an NFA graph structure | |
// SEE: struct mg_re, which these are contained in | |
// NFA: https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton | |
struct mg_re_nfa { | |
// the kind of NFA node (see 'MG_RE_NFA_*') | |
int kind; | |
// outward edges on the NFA graph, or one of the special values: | |
// -1: empty/no edge | |
// -2: open edge that is unlinked | |
int u, v; | |
// tagged union, depending on 'kind' | |
union { | |
// character to match | |
// IF: 'kind==MG_RE_NFA_UCP' | |
int ucp; | |
// signal to match | |
// IF: 'kind==MG_RE_NFA_SIG' | |
int sig; | |
// structure describing a set | |
// IF: 'kind==MG_RE_NFA_ANY' | |
struct { | |
// whether or not the output is normal (false) or inverted (true) | |
// if inverted, | |
bool inv; | |
// bit-set of whether a given 8-bit character, assumes all are 8-bit | |
// or less | |
bool any[256]; | |
// TODO: if you want to implement unicode characters, somehow | |
// express that here | |
} any; | |
}; | |
}; | |
// represents a regular expression pattern, which is basically an NFA regex | |
typedef struct mg_re_s { | |
// the string it was generated from (for error diagonstics) | |
char* expr; | |
// number of NFA states (in 'nfa') | |
int nfa_len; | |
// array of NFA nodes | |
struct mg_re_nfa* nfa; | |
// the entry index into nfa | |
int entry; | |
}* mg_re; | |
// create a new regex from an expression | |
// NOTE: 'expr' should be NUL-terminated (and probably UTF-8, if | |
// you support unicode) | |
mg_re | |
mg_re_make(const char* expr); | |
// free the regular expression structure | |
void | |
mg_re_free(mg_re re); | |
// default iterator structure for a regular expression | |
struct mg_reit { | |
// regex we are iterating over | |
// NOTE: this should not change, once the structure is initialized! | |
mg_re re; | |
// a map of whether we're in the index, for example 'in[2]' means 're->nfa[2]' | |
// is an active state we are traversing | |
bool* in; | |
// temporary buffer, that is hot-swapped with 'in', used while | |
// traversing the NFA | |
bool* lastin; | |
}; | |
// initialize a regex iterator, from a regular expression | |
// NOTE: make sure to call 'mg_reit_free()' to clean up memory | |
void | |
mg_reit_init(struct mg_reit* reit, mg_re re); | |
// free any space used by the regex iterator | |
void | |
mg_reit_free(struct mg_reit* reit); | |
// reset to an empty state, without freeing any memory | |
void | |
mg_reit_reset(struct mg_reit* reit); | |
// simulate a linebegin event | |
bool | |
mg_reit_sig_linebegin(struct mg_reit* reit); | |
// simulate a lineend event | |
bool | |
mg_reit_sig_lineend(struct mg_reit* reit); | |
// add th entry state, return whether in a matching state | |
bool | |
mg_reit_addentry(struct mg_reit* reit); | |
// advance the iterator to the next character | |
// NOTE: returns whether 'reit' is in a matching state | |
bool | |
mg_reit_next(struct mg_reit* reit, int c); | |
/// MG IMPLEMENTATION /// | |
// NOTE: you could keep this part in a seperate file, and put the above in | |
// a header | |
// for example, if you did that to 'mg.h', you would uncomment the following line: | |
//#include <mg.h> | |
// mg_re impl // | |
// utility macro for position shorthand | |
#define POS (*pos) | |
// internal routine to push a new NFA with a given kind on the end of the regular expression | |
// NOTE: returns a pointer to the newly added element | |
static struct mg_re_nfa* | |
mg_re_push_(mg_re re, int kind) { | |
// reallocate array | |
int i = re->nfa_len++; | |
re->nfa = realloc(re->nfa, sizeof(*re->nfa) * re->nfa_len); | |
re->nfa[i].kind = kind; | |
// start out empty | |
re->nfa[i].u = -2; | |
re->nfa[i].v = -1; | |
return &re->nfa[i]; | |
} | |
// internal routine to replace a->b as edges within [start, stop) | |
static void | |
mg_re_link_(mg_re re, int start, int stop, int a, int b) { | |
// loop over selected range | |
int i; | |
for (i = start; i < stop; ++i) { | |
struct mg_re_nfa* nfa = &re->nfa[i]; | |
// replace a -> b | |
if (nfa->u == a) nfa->u = b; | |
if (nfa->v == a) nfa->v = b; | |
} | |
} | |
// internal replace-range function, for duplast | |
static int | |
mg_re_rr_(int x, int start, int stop, int diff) { | |
if (start <= x && x < stop) { | |
x += diff; | |
} | |
return x; | |
} | |
// internal routine to duplicate the range to the end | |
static void | |
mg_re_duprange_(mg_re re, int start, int stop) { | |
// reallocate array | |
int i = re->nfa_len; | |
re->nfa_len += stop - start; | |
re->nfa = realloc(re->nfa, sizeof(*re->nfa) * re->nfa_len); | |
// difference to add | |
int diff = i - start; | |
// copy over | |
memcpy(re->nfa + i, re->nfa + start, (stop - start) * sizeof(*re->nfa)); | |
// link all existing to the second one | |
//mg_re_link_(re, start, stop, -2, i); | |
// now, translate range | |
int j; | |
for (j = i; j < re->nfa_len; ++j) { | |
re->nfa[j].u = mg_re_rr_(re->nfa[j].u, start, stop, diff); | |
re->nfa[j].v = mg_re_rr_(re->nfa[j].v, start, stop, diff); | |
} | |
} | |
// partition | |
static int | |
mg_re_sorti_part_(int n, int* x) { | |
// pivot value | |
int piv = x[0]; | |
int i = -1, j, t; | |
for (j = 0; j < n; ++j) { | |
if (x[j] < piv) { | |
++i; | |
t = x[i]; | |
x[i] = x[j]; | |
x[j] = t; | |
} | |
} | |
// swap | |
t = x[i+1]; | |
x[i+1] = x[n-1]; | |
x[n - 1] = t; | |
return i + 1; | |
} | |
// sort, in-place 'x[0:n]' | |
static void | |
mg_re_sorti_(int n, int* x) { | |
// already sorted | |
if (n < 2) return; | |
int ipiv = mg_re_sorti_part_(n, x); | |
mg_re_sorti_(ipiv - 1, x); | |
mg_re_sorti_(n - ipiv, x + ipiv); | |
} | |
static int | |
mg_re_parse_(mg_re re, const char** pos, int lsl); | |
// internal routine to parse a factor and return the entry position, or -1 if an error | |
static int | |
mg_re_parse_factor_(mg_re re, const char** pos, int lsl) { | |
// starting length of the NFA, which is the node associate with the primary characte | |
int sl = re->nfa_len; | |
// result which is the entry position | |
int res = sl; | |
int c; | |
if (*POS == '(') { | |
POS++; | |
// parse entire structure | |
int e = mg_re_parse_(re, pos, lsl); | |
if (e < 0) return -1; | |
// result to entry | |
res = e; | |
// assume end | |
assert(*POS == ')'); | |
POS++; | |
} else if (*POS == '[') { | |
// set | |
POS++; | |
struct mg_re_nfa* nfa = mg_re_push_(re, MG_RE_NFA_ANY); | |
// '!' leading inverts the selection | |
nfa->any.inv = *POS == '!'; | |
if (nfa->any.inv) POS++; | |
while (*POS && *POS != ']') { | |
c = *POS; | |
POS++; | |
if (c == '\\') { | |
// escape code | |
c = *POS; | |
POS++; | |
if (c == '\\') { | |
nfa->any.any['\\'] = true; | |
} else if (c == 'n') { | |
nfa->any.any['\n'] = true; | |
} else if (c == 't') { | |
nfa->any.any['\t'] = true; | |
} else { | |
nfa->any.any[c] = true; | |
} | |
} else if (*POS == '-') { | |
// range specifier | |
POS++; | |
int cto = *POS; | |
POS++; | |
int i; | |
for (i = c; i <= cto; ++i) { | |
nfa->any.any[i] = true; | |
} | |
} else { | |
// just assume valid character | |
nfa->any.any[c] = true; | |
} | |
} | |
// assume end | |
assert(*POS == ']'); | |
POS++; | |
} else if (*POS == '^') { | |
POS++; | |
// linebegin magic number | |
mg_re_push_(re, MG_RE_NFA_SIG)->sig = -1; | |
} else if (*POS == '$') { | |
POS++; | |
// lineend magic number | |
mg_re_push_(re, MG_RE_NFA_SIG)->sig = -2; | |
} else { | |
c = *POS; | |
POS++; | |
mg_re_push_(re, MG_RE_NFA_UCP)->ucp = c; | |
} | |
// check modifier | |
if (*POS == '?') { | |
POS++; | |
// repeat [0, 1] times, badding an eps transition | |
int sl_ques = re->nfa_len; | |
mg_re_push_(re, MG_RE_NFA_EPS)->v = sl; | |
re->nfa[sl_ques].v = sl; | |
// don't link anything | |
res = sl_ques; | |
} else if (*POS == '+') { | |
POS++; | |
// repeat [1, inf) times, by adding an epsilon transition back to self | |
int sl_plus = re->nfa_len; | |
mg_re_push_(re, MG_RE_NFA_EPS)->v = sl; | |
re->nfa[sl_plus].v = sl; | |
// link all previous to 'sl_plus' | |
mg_re_link_(re, lsl, sl_plus, -2, sl_plus); | |
} else if (*POS == '*' && POS[1] == '*') { | |
POS++; | |
POS++; | |
assert(*POS == '{'); | |
POS++; | |
// n optional | |
int nopt = 0; | |
// optional iteration counts (NOTE: need to be stored) | |
int* opt = NULL; | |
while (*POS != '}') { | |
if (nopt > 0) { | |
assert(*POS == ','); | |
POS++; | |
} | |
int x = 0; | |
// take characters | |
c = *POS; | |
assert('0' <= c && c <= '9'); | |
do { | |
POS++; | |
// advance count | |
x = 10 * x + (c - '0'); | |
c = *POS; | |
} while ('0' <= c && c <= '9'); | |
// reallocation optional counts | |
int i = nopt++; | |
opt = realloc(opt, sizeof(*opt) * nopt); | |
opt[i] = x; | |
if (*POS == ':') { | |
POS++; | |
int y = 0; | |
// take characters | |
c = *POS; | |
assert('0' <= c && c <= '9'); | |
do { | |
POS++; | |
// advance count | |
y = 10 * y + (c - '0'); | |
c = *POS; | |
} while ('0' <= c && c <= '9'); | |
// add a bunch more optional ones | |
int off = nopt; | |
nopt += y - x - 1; | |
opt = realloc(opt, sizeof(*opt) * nopt); | |
for (i = 0; i < y - x - 1; ++i) { | |
opt[off + i] = x + i + 1; | |
} | |
} | |
} | |
// sort that | |
mg_re_sorti_(nopt, opt); | |
// expect end | |
assert(*POS == '}'); | |
POS++; | |
// number to repeat | |
int num_rep = re->nfa_len - sl; | |
// the position to repeat from | |
// NOTE: this changes each time | |
int pos_rep = sl; | |
// how many are required so far | |
int sofar = 1; | |
int lsl_opt = -1; | |
// now, sort in place | |
int i; | |
for (i = 0; i < nopt; ++i) { | |
int sl_opt = -1; | |
if (i > 0) { | |
// add optional component | |
sl_opt = re->nfa_len; | |
mg_re_push_(re, MG_RE_NFA_EPS)->v = sl_opt + 1; | |
// link to the optional | |
mg_re_link_(re, pos_rep, pos_rep + num_rep, -2, sl_opt); | |
} | |
int j; | |
for (j = sofar; j < opt[i]; ++j) { | |
int sl_cur = re->nfa_len; | |
// repeat it | |
mg_re_duprange_(re, pos_rep, pos_rep + num_rep); | |
if (i == 0 || j != sofar) mg_re_link_(re, pos_rep, pos_rep + num_rep, -2, (res - sl) + sl_cur); | |
else mg_re_link_(re, sl_cur, sl_cur + num_rep, sl_opt, -2); | |
// repeat from this location | |
pos_rep = sl_cur; | |
} | |
// save state for next time | |
sofar = opt[i]; | |
} | |
} else if (*POS == '*') { | |
POS++; | |
// repeat [0, inf) times, by adding an epsilon transition back to self | |
// length of the star node | |
int sl_star = re->nfa_len; | |
mg_re_push_(re, MG_RE_NFA_EPS)->v = sl; | |
// link all previous to 'sl_star' | |
mg_re_link_(re, lsl, sl_star, -2, sl_star); | |
// now, we need to enter at the star | |
res = sl_star; | |
} else { | |
// just character match, so we only added 1 state | |
mg_re_link_(re, lsl, sl, -2, sl); | |
} | |
// return entry point | |
return res; | |
} | |
// internal routine to parse a term, and return the entry position, or -1 if an error | |
static int | |
mg_re_parse_term_(mg_re re, const char** pos, int lsl) { | |
// entry point | |
int res = re->nfa_len; | |
int ct = 0; | |
while (*POS && *POS != '|' && *POS != ')') { | |
// starting length | |
int sl = re->nfa_len; | |
// parse another factor, get entry point | |
int e = mg_re_parse_factor_(re, pos, sl); | |
if (e < 0) return e; | |
if (ct > 0) { | |
// link all previous to the entry point | |
mg_re_link_(re, lsl, sl, -2, e); | |
} else { | |
// return entry point | |
res = e; | |
} | |
// restart now | |
lsl = sl; | |
ct++; | |
} | |
return res; | |
} | |
// internal routine to parse an entire routine, and return entry point | |
static int | |
mg_re_parse_(mg_re re, const char** pos, int lsl) { | |
int res = -1; | |
int ct = 0; | |
while (*POS && *POS != ')') { | |
int sl = re->nfa_len; | |
// capture | |
int ssl = sl; | |
while (*POS && *POS != '|' && *POS != ')') { | |
int sl_cur = re->nfa_len; | |
// create a bunch of terms | |
int e = mg_re_parse_term_(re, pos, sl_cur); | |
if (e < 0) return e; | |
if (res < 0) res = e; | |
// link previous to this entry point | |
mg_re_link_(re, ssl, sl_cur, -2, e); | |
ssl = sl_cur; | |
} | |
if (ct > 0) { | |
// add an epsilon that is the result | |
int sl_opt = re->nfa_len; | |
struct mg_re_nfa* nfa = mg_re_push_(re, MG_RE_NFA_EPS); | |
// link back to the previous | |
nfa->u = sl; | |
nfa->v = lsl; | |
res = sl_opt; | |
} | |
// end here | |
ct++; | |
if (*POS != '|') break; | |
// advance past it | |
assert(*POS == '|'); | |
POS++; | |
lsl = sl; | |
} | |
if (res < 0) res = 0; | |
return res; | |
} | |
mg_re | |
mg_re_make(const char* expr) { | |
mg_re re = malloc(sizeof(*re)); | |
assert(re != NULL); | |
// copy expression | |
int sl = strlen(expr); | |
re->expr = malloc(sl + 1); | |
assert(re->expr != NULL); | |
strcpy(re->expr, expr); | |
// start with an empty graph | |
re->nfa_len = 0; | |
re->nfa = NULL; | |
// parse in-place | |
re->entry = mg_re_parse_(re, &expr, 0); | |
if (re->entry < 0) { | |
mg_re_free(re); | |
return NULL; | |
} | |
//printf("[%i], entry: %i\n", re->nfa_len, re->entry); | |
//printf("%i,%c: %i, %i\n", re->nfa[0].kind, re->nfa[0].ucp, re->nfa[0].u, re->nfa[0].v); | |
//printf("%i, %i\n", re->nfa[1].u, re->nfa[1].v); | |
//printf("%i, %i\n", re->nfa[2].u, re->nfa[2].v); | |
//printf("%i, %i\n", re->nfa[3].u, re->nfa[3].v); | |
return re; | |
} | |
void | |
mg_re_free(mg_re re) { | |
free(re->expr); | |
free(re->nfa); | |
free(re); | |
} | |
// mg_reit // | |
void | |
mg_reit_init(struct mg_reit* reit, mg_re re) { | |
reit->re = re; | |
reit->in = malloc(sizeof(*reit->in) * reit->re->nfa_len); | |
assert(reit->in != NULL); | |
reit->lastin = malloc(sizeof(*reit->lastin) * reit->re->nfa_len); | |
assert(reit->lastin != NULL); | |
// reset data | |
mg_reit_reset(reit); | |
} | |
void | |
mg_reit_free(struct mg_reit* reit) { | |
free(reit->in); | |
free(reit->lastin); | |
} | |
void | |
mg_reit_reset(struct mg_reit* reit) { | |
int i; | |
for (i = 0; i < reit->re->nfa_len; ++i) { | |
reit->lastin[i] = reit->in[i] = false; | |
} | |
} | |
// internal routine to add a state (set bm[i]), possibly following other transitions | |
// NOTE: returns whether we are in a matching state | |
static bool | |
mg_reit_add_(struct mg_reit* reit, int s) { | |
if (s == -1) { | |
// -1: empty/nothing further | |
return false; | |
} else if (s == -2) { | |
// -2: accept/match | |
return true; | |
} else { | |
// should have handled all special values | |
assert(s >= 0); | |
// should be a valid index | |
assert(s < reit->re->nfa_len); | |
// add children (which may be empty/invalid) | |
bool res = false; | |
if (reit->re->nfa[s].kind == MG_RE_NFA_EPS) { | |
// epsilon, so add children instead | |
if (mg_reit_add_(reit, reit->re->nfa[s].u)) res = true; | |
if (mg_reit_add_(reit, reit->re->nfa[s].v)) res = true; | |
} else { | |
// not epsilon, so add transition directly | |
reit->in[s] = true; | |
} | |
return res; | |
} | |
} | |
bool | |
mg_reit_sig_linebegin(struct mg_reit* reit) { | |
return mg_reit_next(reit, -1 /* magic value */); | |
} | |
bool | |
mg_reit_sig_lineend(struct mg_reit* reit) { | |
return mg_reit_next(reit, -2 /* magic value */); | |
} | |
bool | |
mg_reit_addentry(struct mg_reit* reit) { | |
// match empty | |
if (reit->re->nfa_len <= 0) return true; | |
return mg_reit_add_(reit, reit->re->entry); | |
} | |
bool | |
mg_reit_next(struct mg_reit* reit, int c) { | |
// NOTE: no unicode support here! | |
assert(c < 256); | |
// swap the buffers, since we are doing the next iteration | |
bool* intmp = reit->in; | |
reit->in = reit->lastin; | |
reit->lastin = intmp; | |
// reset the current input | |
int i; | |
for (i = 0; i < reit->re->nfa_len; ++i) { | |
reit->in[i] = false; | |
} | |
// what to return, whether we are in an accepting state | |
bool res = false; | |
// just normal character | |
for (i = 0; i < reit->re->nfa_len; ++i) { | |
if (reit->lastin[i]) { | |
// we are in the current state | |
struct mg_re_nfa* nfa = &reit->re->nfa[i]; | |
bool good = false; | |
if (nfa->kind == MG_RE_NFA_SIG) { | |
// accepts a special, negative value to feed | |
good = c == nfa->sig; | |
} else if (nfa->kind == MG_RE_NFA_UCP && c >= 0) { | |
// see if we match the current character | |
good = nfa->ucp == c; | |
} else if (nfa->kind == MG_RE_NFA_ANY && c >= 0) { | |
// NOTE: assumes c < 256 | |
good = nfa->any.any[c]; | |
// apply inversion | |
if (nfa->any.inv) good = !good; | |
} | |
if (good) { | |
// add the states | |
if (mg_reit_add_(reit, nfa->u)) res = true; | |
if (mg_reit_add_(reit, nfa->v)) res = true; | |
} | |
} | |
} | |
return res; | |
} | |
/// MG CLI (i.e. you don't need to copy this into your project) /// | |
// exit with usage message | |
static void | |
e_usage(bool isbad) { | |
fprintf(stderr, "usage: mg [re] [src...=./]\n"); | |
fprintf(stderr, "\n"); | |
fprintf(stderr, " -h,--help prints this help/usage menu and exits\n"); | |
fprintf(stderr, " -[DNLS] set the format columns ([Dd]ate, [Nn]ame, [Ll]ineno, [Ss]ource)\n"); | |
fprintf(stderr, "\n"); | |
fprintf(stderr, "see: https://cade.site/2021/09/28/diy-regex-engine\n"); | |
fprintf(stderr, "author: Cade Brown <cade@cade.site>\n"); | |
// exit the process | |
exit(isbad ? EXIT_FAILURE : EXIT_SUCCESS); | |
} | |
// exit with errno value | |
static void | |
e_errno(int errnoval, const char* msg) { | |
fprintf(stderr, "mg: %s: %s\n", msg, strerror(errnoval)); | |
exit(EXIT_FAILURE); | |
} | |
// format match | |
static void | |
p_fmt(const char* fmt, const char* src, struct stat* sb, const char* line, int lineno) { | |
char c; | |
int ct = 0; | |
while ((c = *fmt) != '\0') { | |
fmt++; | |
if (c == 'L' || c == 'l') { | |
if (ct > 0) printf(" | "); | |
// lineno display | |
printf("%4i", lineno+1/*1-based*/); | |
} else if (c == 'S' || c == 's') { | |
if (ct > 0) printf(" | "); | |
// source display | |
// NOTE: we don't want newline | |
printf("%.*s", (int)(strlen(line) - 1), line); | |
} else if (c == 'N' || c == 'n') { | |
if (ct > 0) printf(" | "); | |
// source name | |
// NOTE: we don't want newline | |
printf("%-16s", src); | |
} else if (c == 'D' || c == 'd') { | |
if (ct > 0) printf(" | "); | |
// date name | |
char tmp[256]; | |
// print in ISO8601 | |
strftime(tmp, sizeof(tmp), "%FT%T", gmtime(&sb->st_mtime)); | |
printf("%s", tmp); | |
} | |
ct++; | |
} | |
// finish with a newline | |
printf("\n"); | |
//printf("%4i | %-12s | %s", lineno+1/* 1-based */, src, line); | |
} | |
// utility function to read an entire line of text into '*pline' (of length '*n'), | |
// automatically reallocating | |
// NOTE: returns the length of the line | |
static int | |
u_getline(FILE* fp, int* n, char** pline) { | |
if (!*pline) { | |
// need to allocate some space, so start with 1024 | |
if ((*pline = malloc(*n = 1024)) == NULL) { | |
e_errno(errno, __func__); | |
} | |
} | |
// iterate over characters until we hit the end or a newline | |
char* pos = *pline; | |
int c; | |
while ((c = fgetc(fp)) != EOF && (*pos++ = c) != '\n') { | |
int off = pos - *pline; | |
if (off >= *n) { | |
// we need to reallocate | |
// this is a pretty good hueristic for most purposes | |
*n = off * 2 + 8; | |
if ((*pline = realloc(*pline, *n)) == NULL) { | |
e_errno(errno, __func__); | |
} | |
// recalculate the other values | |
pos = *pline + off; | |
} | |
} | |
if (pos != *pline && pos[-1] != '\n') { | |
// insert newline | |
*pos = '\n'; | |
pos++; | |
} | |
// NUL-terminate | |
*pos = '\0'; | |
return pos - *pline; | |
} | |
// perform search on 'src' file, with the given pattern and iter structure | |
static void | |
do_file(const char* fmt, const char* src, FILE* fp, struct stat* sb, mg_re re, struct mg_reit* reit, int* n, char** pline) { | |
// attempt to open 'fp' | |
if (!fp) { | |
fp = fopen(src, "r"); | |
if (!fp) { | |
e_errno(errno, src); | |
} | |
} | |
// we need to reset our state | |
mg_reit_reset(reit); | |
// line number | |
int lineno = 0; | |
while (true) { | |
// get the next line within the file | |
int len = u_getline(fp, n, pline); | |
if (!len) { | |
// empty line means the end of file has been reached | |
break; | |
} else if (len < 0) { | |
// problem | |
e_errno(errno, src); | |
} | |
// whether or not the current line should be printed | |
bool shouldprint = false; | |
// we want to try to match at every position | |
if (mg_reit_addentry(reit)) { | |
shouldprint = true; | |
} | |
// simulate a linebegin | |
if (mg_reit_sig_linebegin(reit)) { | |
shouldprint = true; | |
} | |
// now, feed the line | |
int i, c; | |
for (i = 0; i < len; ++i) { | |
c = (*pline)[i]; | |
if (c == '\n') break; | |
// we want to try to match at every position | |
if (mg_reit_addentry(reit)) { | |
shouldprint = true; | |
} | |
// add the character | |
if (mg_reit_next(reit, c)) { | |
// in a matching state, so print it out | |
shouldprint = true; | |
} | |
} | |
// we want to try to match at every position | |
if (mg_reit_addentry(reit)) { | |
shouldprint = true; | |
} | |
// simulate a lineend | |
if (mg_reit_sig_lineend(reit)) { | |
shouldprint = true; | |
} | |
// now, print out if we've matched | |
if (shouldprint) { | |
p_fmt(fmt, src, sb, *pline, lineno); | |
// for plain format | |
//printf("%s", *pline); | |
} | |
lineno++; | |
} | |
// done with the file | |
fclose(fp); | |
} | |
static void | |
do_src(const char* fmt, const char* src, mg_re re, struct mg_reit* reit, int* n, char** pline); | |
// perform search for 'src' recursively | |
static void | |
do_dir(const char* fmt, const char* src, mg_re re, struct mg_reit* reit, int* n, char** pline) { | |
DIR* dp = opendir(src); | |
if (!dp) { | |
e_errno(errno, src); | |
} | |
// temporary buffer | |
char tmp[PATH_MAX]; | |
// iterate over children | |
struct dirent* de; | |
while ((de = readdir(dp)) != NULL) { | |
char* name = de->d_name; | |
if (strcmp(".", name) == 0 || strcmp("..", name) == 0) { | |
// ignore | |
} else { | |
// generate name and execute it | |
snprintf(tmp, PATH_MAX, "%s/%s", src, name); | |
do_src(fmt, tmp, re, reit, n, pline); | |
} | |
} | |
// done | |
closedir(dp); | |
} | |
// perform generic search | |
static void | |
do_src(const char* fmt, const char* src, mg_re re, struct mg_reit* reit, int* n, char** pline) { | |
if (strcmp(src, "-") == 0) { // stdin, special case | |
struct stat sb; | |
if (fstat(STDIN_FILENO, &sb) < 0) { | |
e_errno(errno, src); | |
} | |
do_file(fmt, src, stdin, &sb, re, reit, n, pline); | |
return; | |
} | |
// check what 'src' is | |
struct stat sb; | |
if (stat(src, &sb) < 0) { | |
e_errno(errno, src); | |
} | |
if (S_ISREG(sb.st_mode)) { | |
// regular file | |
do_file(fmt, src, NULL, &sb, re, reit, n, pline); | |
} else { | |
// directory | |
do_dir(fmt, src, re, reit, n, pline); | |
} | |
} | |
// default list of sources to search | |
static const char* default_src[] = { | |
"-", | |
}; | |
// default format (name, lineno, src) | |
static const char* default_fmt = "NLS"; | |
int main(int argc, const char** argv) { | |
// requires arguments | |
if (argc < 2) e_usage(true); | |
// now, pull off the expr | |
const char* expr = argv[1]; | |
const char* fmt = default_fmt; | |
int i, j; | |
for (j = i = 1; i < argc; ++i) { | |
if (strcmp("-h", argv[i]) == 0 || strcmp("--help", argv[1]) == 0) { | |
e_usage(false); | |
} else if (argv[i][0] == '-' && argv[i][1] != '\0') { | |
// claim next as format | |
fmt = argv[i] + 1; | |
// shortcuts | |
if (strcmp("d", fmt) == 0 || strcmp("D", fmt) == 0) { | |
fmt = "DNLS"; | |
} | |
} else { | |
// copy in and compact | |
argv[j++] = argv[i]; | |
} | |
} | |
// account for arguments | |
argc = j; | |
// compile the expression | |
mg_re re = mg_re_make(expr); | |
if (!re) { | |
// error should have been reported | |
return EXIT_FAILURE; | |
} | |
// regex iterator, from our pattern | |
struct mg_reit reit; | |
mg_reit_init(&reit, re); | |
// try to use the arguments passed in | |
int nsrc = argc - 2; | |
const char** src = argv + 2; | |
if (nsrc == 0) { | |
// none were given, so use the default sources: | |
nsrc = sizeof(default_src) / sizeof(*default_src); | |
src = default_src; | |
} | |
// for u_getline() | |
int n = 0; | |
char* pline = NULL; | |
for (i = 0; i < nsrc; ++i) { | |
// perform generic search | |
do_src(fmt, src[i], re, &reit, &n, &pline); | |
} | |
// you don't really need to at the end of 'main()', but its a good habit | |
// to free your resources | |
mg_re_free(re); | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
a | |
aa | |
aaa | |
aaaa | |
aaaaa | |
b | |
c | |
ab | |
ac | |
bc | |
abc | |
aaabbb | |
cat | |
bat | |
rat | |
mat | |
map | |
trap | |
crap | |
back | |
wack | |
stack |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment