Skip to content

Instantly share code, notes, and snippets.

@cadebrown
Last active October 6, 2021 03:10
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cadebrown/a949ed37fe15022c101cfe92f7abc72f to your computer and use it in GitHub Desktop.
Save cadebrown/a949ed37fe15022c101cfe92f7abc72f to your computer and use it in GitHub Desktop.
mg (my-grep), an open-source, single-file implementation of regex & grep-like functionality from scratch
/* mg.c - my-grep implementation, portable regex and search implementation in pure C
*
* should build on UNIX/Linux-like OSes no problem, should also work on Windows Subsystem for Linux (WSL)
*
* regex syntax: read the second comment block in this file
*
* compile like:
* $ cc mg.c -o mg
*
* then you can run like:
*
* $ ./mg 'a+b*c?d' file.txt other.txt
* ```
* file.txt | 8 | abcd
* file.txt | 9 | ad
* file.txt | 10 | acd
* ```
*
* You can view help/usage:
* $ ./mg -h
* ```
* usage: mg [re] [src...=./]
*
* -h,--help prints this help/usage menu and exits
* -[DNLS] set the format columns ([Dd]ate, [Nn]ame, [Ll]ineno, [Ss]ource)
*
* see: https://cade.site/2021/09/28/diy-regex-engine
* author: Cade Brown <cade@cade.site>
* ```
*
* you can also customize the format
*
*
*
* if you want to embed the regex library, you can see where it is split from library and CLI source code...
* right now it uses 'assert()' statements and hard exits on errors... obviously you'll have to inject
* your own error recovery/reporting
*
* use where-ever, but give me credit and email me a link to it
*
* @author: Cade Brown <cade@cade.site>
*/
/*** REGEX SYNTAX
*
* here's the regex syntax:
*
* * a regex is made up of 0-or-more terms, connected by '|' (ior/pipe, which matches any of the terms)
* * a term is made up of one or more factors, connected by implicit concatenation
* * a factor is made up of:
* * a single character/escaped character
* * a special meta character:
* * '^': start of line
* * '$': end of line
* * '\s': any space character
* * '\d': any dogot
* * ... etc
* * a set/any (denoted like `[]`, for example `[a-zA-Z0-9_]`)
* * a parentheses, with an entire regex within it (for example, `(a+b)` is a single factor)
* * a factor may be followed by:
* * '?': this factor is optional to match (match 0 or 1 times)
* * '*': this factor can be matched 0-or-more times
* * '+': this factor can be matched 1-or-more times
* * '**{a,b,c:d}': this factor should be matched exactly a, b, or any number between and including c,d
* * for example, '**{2,5,8:10}' matches 2, 5, 8, 9, or 10 times
*
***/
/// INCLUDES ///
// cstd
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
// for STDIN_FILENO
#include <unistd.h>
// for assert()
#include <assert.h>
// for errno
#include <errno.h>
// for strcmp()
#include <string.h>
// for opendir()
#include <dirent.h>
// for stat()
#include <sys/stat.h>
// for strftime()
#include <time.h>
// define it ourself
#ifndef PATH_MAX
#define PATH_MAX 4096
#endif
/// CONSTANTS ///
enum {
// epsilon transitions to {u,v}
MG_RE_NFA_EPS,
// match a signal character, which is a negative character
MG_RE_NFA_SIG,
// match a UCP character
// (no unicode for this demo, left as an exercise to the reader)
MG_RE_NFA_UCP,
// match a set of UCP characters
// see MG_RE_UCP
MG_RE_NFA_ANY,
};
/// TYPES ///
// represents a single node within an NFA graph structure
// SEE: struct mg_re, which these are contained in
// NFA: https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton
struct mg_re_nfa {
// the kind of NFA node (see 'MG_RE_NFA_*')
int kind;
// outward edges on the NFA graph, or one of the special values:
// -1: empty/no edge
// -2: open edge that is unlinked
int u, v;
// tagged union, depending on 'kind'
union {
// character to match
// IF: 'kind==MG_RE_NFA_UCP'
int ucp;
// signal to match
// IF: 'kind==MG_RE_NFA_SIG'
int sig;
// structure describing a set
// IF: 'kind==MG_RE_NFA_ANY'
struct {
// whether or not the output is normal (false) or inverted (true)
// if inverted,
bool inv;
// bit-set of whether a given 8-bit character, assumes all are 8-bit
// or less
bool any[256];
// TODO: if you want to implement unicode characters, somehow
// express that here
} any;
};
};
// represents a regular expression pattern, which is basically an NFA regex
typedef struct mg_re_s {
// the string it was generated from (for error diagonstics)
char* expr;
// number of NFA states (in 'nfa')
int nfa_len;
// array of NFA nodes
struct mg_re_nfa* nfa;
// the entry index into nfa
int entry;
}* mg_re;
// create a new regex from an expression
// NOTE: 'expr' should be NUL-terminated (and probably UTF-8, if
// you support unicode)
mg_re
mg_re_make(const char* expr);
// free the regular expression structure
void
mg_re_free(mg_re re);
// default iterator structure for a regular expression
struct mg_reit {
// regex we are iterating over
// NOTE: this should not change, once the structure is initialized!
mg_re re;
// a map of whether we're in the index, for example 'in[2]' means 're->nfa[2]'
// is an active state we are traversing
bool* in;
// temporary buffer, that is hot-swapped with 'in', used while
// traversing the NFA
bool* lastin;
};
// initialize a regex iterator, from a regular expression
// NOTE: make sure to call 'mg_reit_free()' to clean up memory
void
mg_reit_init(struct mg_reit* reit, mg_re re);
// free any space used by the regex iterator
void
mg_reit_free(struct mg_reit* reit);
// reset to an empty state, without freeing any memory
void
mg_reit_reset(struct mg_reit* reit);
// simulate a linebegin event
bool
mg_reit_sig_linebegin(struct mg_reit* reit);
// simulate a lineend event
bool
mg_reit_sig_lineend(struct mg_reit* reit);
// add th entry state, return whether in a matching state
bool
mg_reit_addentry(struct mg_reit* reit);
// advance the iterator to the next character
// NOTE: returns whether 'reit' is in a matching state
bool
mg_reit_next(struct mg_reit* reit, int c);
/// MG IMPLEMENTATION ///
// NOTE: you could keep this part in a seperate file, and put the above in
// a header
// for example, if you did that to 'mg.h', you would uncomment the following line:
//#include <mg.h>
// mg_re impl //
// utility macro for position shorthand
#define POS (*pos)
// internal routine to push a new NFA with a given kind on the end of the regular expression
// NOTE: returns a pointer to the newly added element
static struct mg_re_nfa*
mg_re_push_(mg_re re, int kind) {
// reallocate array
int i = re->nfa_len++;
re->nfa = realloc(re->nfa, sizeof(*re->nfa) * re->nfa_len);
re->nfa[i].kind = kind;
// start out empty
re->nfa[i].u = -2;
re->nfa[i].v = -1;
return &re->nfa[i];
}
// internal routine to replace a->b as edges within [start, stop)
static void
mg_re_link_(mg_re re, int start, int stop, int a, int b) {
// loop over selected range
int i;
for (i = start; i < stop; ++i) {
struct mg_re_nfa* nfa = &re->nfa[i];
// replace a -> b
if (nfa->u == a) nfa->u = b;
if (nfa->v == a) nfa->v = b;
}
}
// internal replace-range function, for duplast
static int
mg_re_rr_(int x, int start, int stop, int diff) {
if (start <= x && x < stop) {
x += diff;
}
return x;
}
// internal routine to duplicate the range to the end
static void
mg_re_duprange_(mg_re re, int start, int stop) {
// reallocate array
int i = re->nfa_len;
re->nfa_len += stop - start;
re->nfa = realloc(re->nfa, sizeof(*re->nfa) * re->nfa_len);
// difference to add
int diff = i - start;
// copy over
memcpy(re->nfa + i, re->nfa + start, (stop - start) * sizeof(*re->nfa));
// link all existing to the second one
//mg_re_link_(re, start, stop, -2, i);
// now, translate range
int j;
for (j = i; j < re->nfa_len; ++j) {
re->nfa[j].u = mg_re_rr_(re->nfa[j].u, start, stop, diff);
re->nfa[j].v = mg_re_rr_(re->nfa[j].v, start, stop, diff);
}
}
// partition
static int
mg_re_sorti_part_(int n, int* x) {
// pivot value
int piv = x[0];
int i = -1, j, t;
for (j = 0; j < n; ++j) {
if (x[j] < piv) {
++i;
t = x[i];
x[i] = x[j];
x[j] = t;
}
}
// swap
t = x[i+1];
x[i+1] = x[n-1];
x[n - 1] = t;
return i + 1;
}
// sort, in-place 'x[0:n]'
static void
mg_re_sorti_(int n, int* x) {
// already sorted
if (n < 2) return;
int ipiv = mg_re_sorti_part_(n, x);
mg_re_sorti_(ipiv - 1, x);
mg_re_sorti_(n - ipiv, x + ipiv);
}
static int
mg_re_parse_(mg_re re, const char** pos, int lsl);
// internal routine to parse a factor and return the entry position, or -1 if an error
static int
mg_re_parse_factor_(mg_re re, const char** pos, int lsl) {
// starting length of the NFA, which is the node associate with the primary characte
int sl = re->nfa_len;
// result which is the entry position
int res = sl;
int c;
if (*POS == '(') {
POS++;
// parse entire structure
int e = mg_re_parse_(re, pos, lsl);
if (e < 0) return -1;
// result to entry
res = e;
// assume end
assert(*POS == ')');
POS++;
} else if (*POS == '[') {
// set
POS++;
struct mg_re_nfa* nfa = mg_re_push_(re, MG_RE_NFA_ANY);
// '!' leading inverts the selection
nfa->any.inv = *POS == '!';
if (nfa->any.inv) POS++;
while (*POS && *POS != ']') {
c = *POS;
POS++;
if (c == '\\') {
// escape code
c = *POS;
POS++;
if (c == '\\') {
nfa->any.any['\\'] = true;
} else if (c == 'n') {
nfa->any.any['\n'] = true;
} else if (c == 't') {
nfa->any.any['\t'] = true;
} else {
nfa->any.any[c] = true;
}
} else if (*POS == '-') {
// range specifier
POS++;
int cto = *POS;
POS++;
int i;
for (i = c; i <= cto; ++i) {
nfa->any.any[i] = true;
}
} else {
// just assume valid character
nfa->any.any[c] = true;
}
}
// assume end
assert(*POS == ']');
POS++;
} else if (*POS == '^') {
POS++;
// linebegin magic number
mg_re_push_(re, MG_RE_NFA_SIG)->sig = -1;
} else if (*POS == '$') {
POS++;
// lineend magic number
mg_re_push_(re, MG_RE_NFA_SIG)->sig = -2;
} else {
c = *POS;
POS++;
mg_re_push_(re, MG_RE_NFA_UCP)->ucp = c;
}
// check modifier
if (*POS == '?') {
POS++;
// repeat [0, 1] times, badding an eps transition
int sl_ques = re->nfa_len;
mg_re_push_(re, MG_RE_NFA_EPS)->v = sl;
re->nfa[sl_ques].v = sl;
// don't link anything
res = sl_ques;
} else if (*POS == '+') {
POS++;
// repeat [1, inf) times, by adding an epsilon transition back to self
int sl_plus = re->nfa_len;
mg_re_push_(re, MG_RE_NFA_EPS)->v = sl;
re->nfa[sl_plus].v = sl;
// link all previous to 'sl_plus'
mg_re_link_(re, lsl, sl_plus, -2, sl_plus);
} else if (*POS == '*' && POS[1] == '*') {
POS++;
POS++;
assert(*POS == '{');
POS++;
// n optional
int nopt = 0;
// optional iteration counts (NOTE: need to be stored)
int* opt = NULL;
while (*POS != '}') {
if (nopt > 0) {
assert(*POS == ',');
POS++;
}
int x = 0;
// take characters
c = *POS;
assert('0' <= c && c <= '9');
do {
POS++;
// advance count
x = 10 * x + (c - '0');
c = *POS;
} while ('0' <= c && c <= '9');
// reallocation optional counts
int i = nopt++;
opt = realloc(opt, sizeof(*opt) * nopt);
opt[i] = x;
if (*POS == ':') {
POS++;
int y = 0;
// take characters
c = *POS;
assert('0' <= c && c <= '9');
do {
POS++;
// advance count
y = 10 * y + (c - '0');
c = *POS;
} while ('0' <= c && c <= '9');
// add a bunch more optional ones
int off = nopt;
nopt += y - x - 1;
opt = realloc(opt, sizeof(*opt) * nopt);
for (i = 0; i < y - x - 1; ++i) {
opt[off + i] = x + i + 1;
}
}
}
// sort that
mg_re_sorti_(nopt, opt);
// expect end
assert(*POS == '}');
POS++;
// number to repeat
int num_rep = re->nfa_len - sl;
// the position to repeat from
// NOTE: this changes each time
int pos_rep = sl;
// how many are required so far
int sofar = 1;
int lsl_opt = -1;
// now, sort in place
int i;
for (i = 0; i < nopt; ++i) {
int sl_opt = -1;
if (i > 0) {
// add optional component
sl_opt = re->nfa_len;
mg_re_push_(re, MG_RE_NFA_EPS)->v = sl_opt + 1;
// link to the optional
mg_re_link_(re, pos_rep, pos_rep + num_rep, -2, sl_opt);
}
int j;
for (j = sofar; j < opt[i]; ++j) {
int sl_cur = re->nfa_len;
// repeat it
mg_re_duprange_(re, pos_rep, pos_rep + num_rep);
if (i == 0 || j != sofar) mg_re_link_(re, pos_rep, pos_rep + num_rep, -2, (res - sl) + sl_cur);
else mg_re_link_(re, sl_cur, sl_cur + num_rep, sl_opt, -2);
// repeat from this location
pos_rep = sl_cur;
}
// save state for next time
sofar = opt[i];
}
} else if (*POS == '*') {
POS++;
// repeat [0, inf) times, by adding an epsilon transition back to self
// length of the star node
int sl_star = re->nfa_len;
mg_re_push_(re, MG_RE_NFA_EPS)->v = sl;
// link all previous to 'sl_star'
mg_re_link_(re, lsl, sl_star, -2, sl_star);
// now, we need to enter at the star
res = sl_star;
} else {
// just character match, so we only added 1 state
mg_re_link_(re, lsl, sl, -2, sl);
}
// return entry point
return res;
}
// internal routine to parse a term, and return the entry position, or -1 if an error
static int
mg_re_parse_term_(mg_re re, const char** pos, int lsl) {
// entry point
int res = re->nfa_len;
int ct = 0;
while (*POS && *POS != '|' && *POS != ')') {
// starting length
int sl = re->nfa_len;
// parse another factor, get entry point
int e = mg_re_parse_factor_(re, pos, sl);
if (e < 0) return e;
if (ct > 0) {
// link all previous to the entry point
mg_re_link_(re, lsl, sl, -2, e);
} else {
// return entry point
res = e;
}
// restart now
lsl = sl;
ct++;
}
return res;
}
// internal routine to parse an entire routine, and return entry point
static int
mg_re_parse_(mg_re re, const char** pos, int lsl) {
int res = -1;
int ct = 0;
while (*POS && *POS != ')') {
int sl = re->nfa_len;
// capture
int ssl = sl;
while (*POS && *POS != '|' && *POS != ')') {
int sl_cur = re->nfa_len;
// create a bunch of terms
int e = mg_re_parse_term_(re, pos, sl_cur);
if (e < 0) return e;
if (res < 0) res = e;
// link previous to this entry point
mg_re_link_(re, ssl, sl_cur, -2, e);
ssl = sl_cur;
}
if (ct > 0) {
// add an epsilon that is the result
int sl_opt = re->nfa_len;
struct mg_re_nfa* nfa = mg_re_push_(re, MG_RE_NFA_EPS);
// link back to the previous
nfa->u = sl;
nfa->v = lsl;
res = sl_opt;
}
// end here
ct++;
if (*POS != '|') break;
// advance past it
assert(*POS == '|');
POS++;
lsl = sl;
}
if (res < 0) res = 0;
return res;
}
mg_re
mg_re_make(const char* expr) {
mg_re re = malloc(sizeof(*re));
assert(re != NULL);
// copy expression
int sl = strlen(expr);
re->expr = malloc(sl + 1);
assert(re->expr != NULL);
strcpy(re->expr, expr);
// start with an empty graph
re->nfa_len = 0;
re->nfa = NULL;
// parse in-place
re->entry = mg_re_parse_(re, &expr, 0);
if (re->entry < 0) {
mg_re_free(re);
return NULL;
}
//printf("[%i], entry: %i\n", re->nfa_len, re->entry);
//printf("%i,%c: %i, %i\n", re->nfa[0].kind, re->nfa[0].ucp, re->nfa[0].u, re->nfa[0].v);
//printf("%i, %i\n", re->nfa[1].u, re->nfa[1].v);
//printf("%i, %i\n", re->nfa[2].u, re->nfa[2].v);
//printf("%i, %i\n", re->nfa[3].u, re->nfa[3].v);
return re;
}
void
mg_re_free(mg_re re) {
free(re->expr);
free(re->nfa);
free(re);
}
// mg_reit //
void
mg_reit_init(struct mg_reit* reit, mg_re re) {
reit->re = re;
reit->in = malloc(sizeof(*reit->in) * reit->re->nfa_len);
assert(reit->in != NULL);
reit->lastin = malloc(sizeof(*reit->lastin) * reit->re->nfa_len);
assert(reit->lastin != NULL);
// reset data
mg_reit_reset(reit);
}
void
mg_reit_free(struct mg_reit* reit) {
free(reit->in);
free(reit->lastin);
}
void
mg_reit_reset(struct mg_reit* reit) {
int i;
for (i = 0; i < reit->re->nfa_len; ++i) {
reit->lastin[i] = reit->in[i] = false;
}
}
// internal routine to add a state (set bm[i]), possibly following other transitions
// NOTE: returns whether we are in a matching state
static bool
mg_reit_add_(struct mg_reit* reit, int s) {
if (s == -1) {
// -1: empty/nothing further
return false;
} else if (s == -2) {
// -2: accept/match
return true;
} else {
// should have handled all special values
assert(s >= 0);
// should be a valid index
assert(s < reit->re->nfa_len);
// add children (which may be empty/invalid)
bool res = false;
if (reit->re->nfa[s].kind == MG_RE_NFA_EPS) {
// epsilon, so add children instead
if (mg_reit_add_(reit, reit->re->nfa[s].u)) res = true;
if (mg_reit_add_(reit, reit->re->nfa[s].v)) res = true;
} else {
// not epsilon, so add transition directly
reit->in[s] = true;
}
return res;
}
}
bool
mg_reit_sig_linebegin(struct mg_reit* reit) {
return mg_reit_next(reit, -1 /* magic value */);
}
bool
mg_reit_sig_lineend(struct mg_reit* reit) {
return mg_reit_next(reit, -2 /* magic value */);
}
bool
mg_reit_addentry(struct mg_reit* reit) {
// match empty
if (reit->re->nfa_len <= 0) return true;
return mg_reit_add_(reit, reit->re->entry);
}
bool
mg_reit_next(struct mg_reit* reit, int c) {
// NOTE: no unicode support here!
assert(c < 256);
// swap the buffers, since we are doing the next iteration
bool* intmp = reit->in;
reit->in = reit->lastin;
reit->lastin = intmp;
// reset the current input
int i;
for (i = 0; i < reit->re->nfa_len; ++i) {
reit->in[i] = false;
}
// what to return, whether we are in an accepting state
bool res = false;
// just normal character
for (i = 0; i < reit->re->nfa_len; ++i) {
if (reit->lastin[i]) {
// we are in the current state
struct mg_re_nfa* nfa = &reit->re->nfa[i];
bool good = false;
if (nfa->kind == MG_RE_NFA_SIG) {
// accepts a special, negative value to feed
good = c == nfa->sig;
} else if (nfa->kind == MG_RE_NFA_UCP && c >= 0) {
// see if we match the current character
good = nfa->ucp == c;
} else if (nfa->kind == MG_RE_NFA_ANY && c >= 0) {
// NOTE: assumes c < 256
good = nfa->any.any[c];
// apply inversion
if (nfa->any.inv) good = !good;
}
if (good) {
// add the states
if (mg_reit_add_(reit, nfa->u)) res = true;
if (mg_reit_add_(reit, nfa->v)) res = true;
}
}
}
return res;
}
/// MG CLI (i.e. you don't need to copy this into your project) ///
// exit with usage message
static void
e_usage(bool isbad) {
fprintf(stderr, "usage: mg [re] [src...=./]\n");
fprintf(stderr, "\n");
fprintf(stderr, " -h,--help prints this help/usage menu and exits\n");
fprintf(stderr, " -[DNLS] set the format columns ([Dd]ate, [Nn]ame, [Ll]ineno, [Ss]ource)\n");
fprintf(stderr, "\n");
fprintf(stderr, "see: https://cade.site/2021/09/28/diy-regex-engine\n");
fprintf(stderr, "author: Cade Brown <cade@cade.site>\n");
// exit the process
exit(isbad ? EXIT_FAILURE : EXIT_SUCCESS);
}
// exit with errno value
static void
e_errno(int errnoval, const char* msg) {
fprintf(stderr, "mg: %s: %s\n", msg, strerror(errnoval));
exit(EXIT_FAILURE);
}
// format match
static void
p_fmt(const char* fmt, const char* src, struct stat* sb, const char* line, int lineno) {
char c;
int ct = 0;
while ((c = *fmt) != '\0') {
fmt++;
if (c == 'L' || c == 'l') {
if (ct > 0) printf(" | ");
// lineno display
printf("%4i", lineno+1/*1-based*/);
} else if (c == 'S' || c == 's') {
if (ct > 0) printf(" | ");
// source display
// NOTE: we don't want newline
printf("%.*s", (int)(strlen(line) - 1), line);
} else if (c == 'N' || c == 'n') {
if (ct > 0) printf(" | ");
// source name
// NOTE: we don't want newline
printf("%-16s", src);
} else if (c == 'D' || c == 'd') {
if (ct > 0) printf(" | ");
// date name
char tmp[256];
// print in ISO8601
strftime(tmp, sizeof(tmp), "%FT%T", gmtime(&sb->st_mtime));
printf("%s", tmp);
}
ct++;
}
// finish with a newline
printf("\n");
//printf("%4i | %-12s | %s", lineno+1/* 1-based */, src, line);
}
// utility function to read an entire line of text into '*pline' (of length '*n'),
// automatically reallocating
// NOTE: returns the length of the line
static int
u_getline(FILE* fp, int* n, char** pline) {
if (!*pline) {
// need to allocate some space, so start with 1024
if ((*pline = malloc(*n = 1024)) == NULL) {
e_errno(errno, __func__);
}
}
// iterate over characters until we hit the end or a newline
char* pos = *pline;
int c;
while ((c = fgetc(fp)) != EOF && (*pos++ = c) != '\n') {
int off = pos - *pline;
if (off >= *n) {
// we need to reallocate
// this is a pretty good hueristic for most purposes
*n = off * 2 + 8;
if ((*pline = realloc(*pline, *n)) == NULL) {
e_errno(errno, __func__);
}
// recalculate the other values
pos = *pline + off;
}
}
if (pos != *pline && pos[-1] != '\n') {
// insert newline
*pos = '\n';
pos++;
}
// NUL-terminate
*pos = '\0';
return pos - *pline;
}
// perform search on 'src' file, with the given pattern and iter structure
static void
do_file(const char* fmt, const char* src, FILE* fp, struct stat* sb, mg_re re, struct mg_reit* reit, int* n, char** pline) {
// attempt to open 'fp'
if (!fp) {
fp = fopen(src, "r");
if (!fp) {
e_errno(errno, src);
}
}
// we need to reset our state
mg_reit_reset(reit);
// line number
int lineno = 0;
while (true) {
// get the next line within the file
int len = u_getline(fp, n, pline);
if (!len) {
// empty line means the end of file has been reached
break;
} else if (len < 0) {
// problem
e_errno(errno, src);
}
// whether or not the current line should be printed
bool shouldprint = false;
// we want to try to match at every position
if (mg_reit_addentry(reit)) {
shouldprint = true;
}
// simulate a linebegin
if (mg_reit_sig_linebegin(reit)) {
shouldprint = true;
}
// now, feed the line
int i, c;
for (i = 0; i < len; ++i) {
c = (*pline)[i];
if (c == '\n') break;
// we want to try to match at every position
if (mg_reit_addentry(reit)) {
shouldprint = true;
}
// add the character
if (mg_reit_next(reit, c)) {
// in a matching state, so print it out
shouldprint = true;
}
}
// we want to try to match at every position
if (mg_reit_addentry(reit)) {
shouldprint = true;
}
// simulate a lineend
if (mg_reit_sig_lineend(reit)) {
shouldprint = true;
}
// now, print out if we've matched
if (shouldprint) {
p_fmt(fmt, src, sb, *pline, lineno);
// for plain format
//printf("%s", *pline);
}
lineno++;
}
// done with the file
fclose(fp);
}
static void
do_src(const char* fmt, const char* src, mg_re re, struct mg_reit* reit, int* n, char** pline);
// perform search for 'src' recursively
static void
do_dir(const char* fmt, const char* src, mg_re re, struct mg_reit* reit, int* n, char** pline) {
DIR* dp = opendir(src);
if (!dp) {
e_errno(errno, src);
}
// temporary buffer
char tmp[PATH_MAX];
// iterate over children
struct dirent* de;
while ((de = readdir(dp)) != NULL) {
char* name = de->d_name;
if (strcmp(".", name) == 0 || strcmp("..", name) == 0) {
// ignore
} else {
// generate name and execute it
snprintf(tmp, PATH_MAX, "%s/%s", src, name);
do_src(fmt, tmp, re, reit, n, pline);
}
}
// done
closedir(dp);
}
// perform generic search
static void
do_src(const char* fmt, const char* src, mg_re re, struct mg_reit* reit, int* n, char** pline) {
if (strcmp(src, "-") == 0) { // stdin, special case
struct stat sb;
if (fstat(STDIN_FILENO, &sb) < 0) {
e_errno(errno, src);
}
do_file(fmt, src, stdin, &sb, re, reit, n, pline);
return;
}
// check what 'src' is
struct stat sb;
if (stat(src, &sb) < 0) {
e_errno(errno, src);
}
if (S_ISREG(sb.st_mode)) {
// regular file
do_file(fmt, src, NULL, &sb, re, reit, n, pline);
} else {
// directory
do_dir(fmt, src, re, reit, n, pline);
}
}
// default list of sources to search
static const char* default_src[] = {
"-",
};
// default format (name, lineno, src)
static const char* default_fmt = "NLS";
int main(int argc, const char** argv) {
// requires arguments
if (argc < 2) e_usage(true);
// now, pull off the expr
const char* expr = argv[1];
const char* fmt = default_fmt;
int i, j;
for (j = i = 1; i < argc; ++i) {
if (strcmp("-h", argv[i]) == 0 || strcmp("--help", argv[1]) == 0) {
e_usage(false);
} else if (argv[i][0] == '-' && argv[i][1] != '\0') {
// claim next as format
fmt = argv[i] + 1;
// shortcuts
if (strcmp("d", fmt) == 0 || strcmp("D", fmt) == 0) {
fmt = "DNLS";
}
} else {
// copy in and compact
argv[j++] = argv[i];
}
}
// account for arguments
argc = j;
// compile the expression
mg_re re = mg_re_make(expr);
if (!re) {
// error should have been reported
return EXIT_FAILURE;
}
// regex iterator, from our pattern
struct mg_reit reit;
mg_reit_init(&reit, re);
// try to use the arguments passed in
int nsrc = argc - 2;
const char** src = argv + 2;
if (nsrc == 0) {
// none were given, so use the default sources:
nsrc = sizeof(default_src) / sizeof(*default_src);
src = default_src;
}
// for u_getline()
int n = 0;
char* pline = NULL;
for (i = 0; i < nsrc; ++i) {
// perform generic search
do_src(fmt, src[i], re, &reit, &n, &pline);
}
// you don't really need to at the end of 'main()', but its a good habit
// to free your resources
mg_re_free(re);
}
a
aa
aaa
aaaa
aaaaa
b
c
ab
ac
bc
abc
aaabbb
cat
bat
rat
mat
map
trap
crap
back
wack
stack
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment