cadebrown/mg.c

## mg.c
/* mg.c - my-grep implementation, portable regex and search implementation in pure C
 *
 * should build on UNIX/Linux-like OSes no problem, should also work on Windows Subsystem for Linux (WSL)
 *
 * regex syntax: read the second comment block in this file
 *
 * compile like:
 * $ cc mg.c -o mg
 *
 * then you can run like:
 *
 * $ ./mg 'a+b*c?d' file.txt other.txt
 * ```
 * file.txt         |    8 | abcd
 * file.txt         |    9 | ad
 * file.txt         |   10 | acd
 * ```
 *
 * You can view help/usage:
 * $ ./mg -h
 * ```
 * usage: mg [re] [src...=./]
 *
 *   -h,--help            prints this help/usage menu and exits
 *   -[DNLS]              set the format columns ([Dd]ate, [Nn]ame, [Ll]ineno, [Ss]ource)
 *
 * see: https://cade.site/2021/09/28/diy-regex-engine
 * author: Cade Brown <cade@cade.site>
 * ```
 *
 * you can also customize the format
 *
 *
 *
 * if you want to embed the regex library, you can see where it is split from library and CLI source code...
 *   right now it uses 'assert()' statements and hard exits on errors... obviously you'll have to inject
 *   your own error recovery/reporting
 *
 * use where-ever, but give me credit and email me a link to it
 *
 * @author: Cade Brown <cade@cade.site>
 */

/*** REGEX SYNTAX
 *
 * here's the regex syntax:
 *
 *   * a regex is made up of 0-or-more terms, connected by '|' (ior/pipe, which matches any of the terms)
 *   * a term is made up of one or more factors, connected by implicit concatenation
 *   * a factor is made up of:
 *     * a single character/escaped character
 *     * a special meta character:
 *       * '^': start of line
 *       * '$': end of line
 *       * '\s': any space character
 *       * '\d': any dogot
 *       * ... etc
 *     * a set/any (denoted like `[]`, for example `[a-zA-Z0-9_]`)
 *     * a parentheses, with an entire regex within it (for example, `(a+b)` is a single factor)
 *   * a factor may be followed by:
 *     * '?': this factor is optional to match (match 0 or 1 times)
 *     * '*': this factor can be matched 0-or-more times
 *     * '+': this factor can be matched 1-or-more times
 *     * '**{a,b,c:d}': this factor should be matched exactly a, b, or any number between and including c,d
 *       * for example, '**{2,5,8:10}' matches 2, 5, 8, 9, or 10 times
 *
 ***/


/// INCLUDES ///

// cstd
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>

// for STDIN_FILENO
#include <unistd.h>


// for assert()
#include <assert.h>

// for errno
#include <errno.h>

// for strcmp()
#include <string.h>

// for opendir()
#include <dirent.h>

// for stat()
#include <sys/stat.h>

// for strftime()
#include <time.h>


// define it ourself
#ifndef PATH_MAX
  #define PATH_MAX 4096
#endif


/// CONSTANTS ///

enum {

    // epsilon transitions to {u,v}
    MG_RE_NFA_EPS,

    // match a signal character, which is a negative character
    MG_RE_NFA_SIG,

    // match a UCP character
    // (no unicode for this demo, left as an exercise to the reader)
    MG_RE_NFA_UCP,

    // match a set of UCP characters
    // see MG_RE_UCP
    MG_RE_NFA_ANY,

};


/// TYPES ///

// represents a single node within an NFA graph structure
// SEE: struct mg_re, which these are contained in
// NFA: https://en.wikipedia.org/wiki/Nondeterministic_finite_automaton
struct mg_re_nfa {

    // the kind of NFA node (see 'MG_RE_NFA_*')
    int kind;

    // outward edges on the NFA graph, or one of the special values:
    //   -1: empty/no edge
    //   -2: open edge that is unlinked
    int u, v;

    // tagged union, depending on 'kind'
    union {
        // character to match
        // IF: 'kind==MG_RE_NFA_UCP'
        int ucp;

        // signal to match
        // IF: 'kind==MG_RE_NFA_SIG'
        int sig;

        // structure describing a set
        // IF: 'kind==MG_RE_NFA_ANY'
        struct {

            // whether or not the output is normal (false) or inverted (true)
            // if inverted,
            bool inv;

            // bit-set of whether a given 8-bit character, assumes all are 8-bit
            //   or less
            bool any[256];

            // TODO: if you want to implement unicode characters, somehow
            //         express that here

        } any;
    };
};

// represents a regular expression pattern, which is basically an NFA regex
typedef struct mg_re_s {

    // the string it was generated from (for error diagonstics)
    char* expr;

    // number of NFA states (in 'nfa')
    int nfa_len;

    // array of NFA nodes
    struct mg_re_nfa* nfa;

    // the entry index into nfa
    int entry;

}* mg_re;

// create a new regex from an expression
// NOTE: 'expr' should be NUL-terminated (and probably UTF-8, if
//         you support unicode)
mg_re
mg_re_make(const char* expr);

// free the regular expression structure
void
mg_re_free(mg_re re);


// default iterator structure for a regular expression
struct mg_reit {

    // regex we are iterating over
    // NOTE: this should not change, once the structure is initialized!
    mg_re re;

    // a map of whether we're in the index, for example 'in[2]' means 're->nfa[2]'
    //   is an active state we are traversing
    bool* in;

    // temporary buffer, that is hot-swapped with 'in', used while
    //   traversing the NFA
    bool* lastin;

};

// initialize a regex iterator, from a regular expression
// NOTE: make sure to call 'mg_reit_free()' to clean up memory
void
mg_reit_init(struct mg_reit* reit, mg_re re);

// free any space used by the regex iterator
void
mg_reit_free(struct mg_reit* reit);

// reset to an empty state, without freeing any memory
void
mg_reit_reset(struct mg_reit* reit);


// simulate a linebegin event
bool
mg_reit_sig_linebegin(struct mg_reit* reit);

// simulate a lineend event
bool
mg_reit_sig_lineend(struct mg_reit* reit);

// add th entry state, return whether in a matching state
bool
mg_reit_addentry(struct mg_reit* reit);

// advance the iterator to the next character
// NOTE: returns whether 'reit' is in a matching state
bool
mg_reit_next(struct mg_reit* reit, int c);


/// MG IMPLEMENTATION ///
// NOTE: you could keep this part in a seperate file, and put the above in
//         a header
// for example, if you did that to 'mg.h', you would uncomment the following line:
//#include <mg.h>


// mg_re impl //


// utility macro for position shorthand
#define POS   (*pos)

// internal routine to push a new NFA with a given kind on the end of the regular expression
// NOTE: returns a pointer to the newly added element
static struct mg_re_nfa*
mg_re_push_(mg_re re, int kind) {

    // reallocate array
    int i = re->nfa_len++;
    re->nfa = realloc(re->nfa, sizeof(*re->nfa) * re->nfa_len);

    re->nfa[i].kind = kind;

    // start out empty
    re->nfa[i].u = -2;
    re->nfa[i].v = -1;

    return &re->nfa[i];
}

// internal routine to replace a->b as edges within [start, stop)
static void
mg_re_link_(mg_re re, int start, int stop, int a, int b) {

    // loop over selected range
    int i;
    for (i = start; i < stop; ++i) {
        struct mg_re_nfa* nfa = &re->nfa[i];

        // replace a -> b
        if (nfa->u == a) nfa->u = b;
        if (nfa->v == a) nfa->v = b;
    }
}

// internal replace-range function, for duplast
static int
mg_re_rr_(int x, int start, int stop, int diff) {
    if (start <= x && x < stop) {
        x += diff;
    }
    return x;
}


// internal routine to duplicate the range to the end
static void
mg_re_duprange_(mg_re re, int start, int stop) {

    // reallocate array
    int i = re->nfa_len;
    re->nfa_len += stop - start;
    re->nfa = realloc(re->nfa, sizeof(*re->nfa) * re->nfa_len);

    // difference to add
    int diff = i - start;

    // copy over
    memcpy(re->nfa + i, re->nfa + start, (stop - start) * sizeof(*re->nfa));

    // link all existing to the second one
    //mg_re_link_(re, start, stop, -2, i);

    // now, translate range
    int j;
    for (j = i; j < re->nfa_len; ++j) {
        re->nfa[j].u = mg_re_rr_(re->nfa[j].u, start, stop, diff);
        re->nfa[j].v = mg_re_rr_(re->nfa[j].v, start, stop, diff);
    }

}

// partition
static int
mg_re_sorti_part_(int n, int* x) {

    // pivot value
    int piv = x[0];

    int i = -1, j, t;
    for (j = 0; j < n; ++j) {
        if (x[j] < piv) {
            ++i;
            t = x[i];
            x[i] = x[j];
            x[j] = t;
        }
    }


    // swap
    t = x[i+1];
    x[i+1] = x[n-1];
    x[n - 1] = t;

    return i + 1;
}

// sort, in-place 'x[0:n]'
static void
mg_re_sorti_(int n, int* x) {
    // already sorted
    if (n < 2) return;

    int ipiv = mg_re_sorti_part_(n, x);

    mg_re_sorti_(ipiv - 1, x);
    mg_re_sorti_(n - ipiv, x + ipiv);

}

static int
mg_re_parse_(mg_re re, const char** pos, int lsl);


// internal routine to parse a factor and return the entry position, or -1 if an error
static int
mg_re_parse_factor_(mg_re re, const char** pos, int lsl) {

    // starting length of the NFA, which is the node associate with the primary characte
    int sl = re->nfa_len;

    // result which is the entry position
    int res = sl;

    int c;
    if (*POS == '(') {
        POS++;

        // parse entire structure
        int e = mg_re_parse_(re, pos, lsl);
        if (e < 0) return -1;

        // result to entry
        res = e;

        // assume end
        assert(*POS == ')');
        POS++;

    } else if (*POS == '[') {
        // set
        POS++;

        struct mg_re_nfa* nfa = mg_re_push_(re, MG_RE_NFA_ANY);

        // '!' leading inverts the selection
        nfa->any.inv = *POS == '!';
        if (nfa->any.inv) POS++;

        while (*POS && *POS != ']') {
            c = *POS;
            POS++;
            if (c == '\\') {
                // escape code
                c = *POS;
                POS++;
                if (c == '\\') {
                    nfa->any.any['\\'] = true;
                } else if (c == 'n') {
                    nfa->any.any['\n'] = true;
                } else if (c == 't') {
                    nfa->any.any['\t'] = true;
                } else {
                    nfa->any.any[c] = true;
                }
            } else if (*POS == '-') {
                // range specifier
                POS++;
                int cto = *POS;
                POS++;
                int i;
                for (i = c; i <= cto; ++i) {
                    nfa->any.any[i] = true;
                }
            } else {
                // just assume valid character
                nfa->any.any[c] = true;
            }
        }

        // assume end
        assert(*POS == ']');
        POS++;

    } else if (*POS == '^') {
        POS++;

        // linebegin magic number
        mg_re_push_(re, MG_RE_NFA_SIG)->sig = -1;

    } else if (*POS == '$') {
        POS++;

        // lineend magic number
        mg_re_push_(re, MG_RE_NFA_SIG)->sig = -2;

    } else {
        c = *POS;

        POS++;
        mg_re_push_(re, MG_RE_NFA_UCP)->ucp = c;
    }

    // check modifier
    if (*POS == '?') {
        POS++;

        // repeat [0, 1] times, badding an eps transition
        int sl_ques = re->nfa_len;
        mg_re_push_(re, MG_RE_NFA_EPS)->v = sl;
        re->nfa[sl_ques].v = sl;

        // don't link anything
        res = sl_ques;
    } else if (*POS == '+') {
        POS++;

        // repeat [1, inf) times, by adding an epsilon transition back to self
        int sl_plus = re->nfa_len;
        mg_re_push_(re, MG_RE_NFA_EPS)->v = sl;
        re->nfa[sl_plus].v = sl;

        // link all previous to 'sl_plus'
        mg_re_link_(re, lsl, sl_plus, -2, sl_plus);

    } else if (*POS == '*' && POS[1] == '*') {
        POS++;
        POS++;

        assert(*POS == '{');
        POS++;

        // n optional
        int nopt = 0;
        // optional iteration counts (NOTE: need to be stored)
        int* opt = NULL;

        while (*POS != '}') {
            if (nopt > 0) {
                assert(*POS == ',');
                POS++;
            }

            int x = 0;

            // take characters
            c = *POS;
            assert('0' <= c && c <= '9');
            do {
                POS++;

                // advance count
                x = 10 * x + (c - '0');

                c = *POS;
            } while ('0' <= c && c <= '9');

            // reallocation optional counts
            int i = nopt++;
            opt = realloc(opt, sizeof(*opt) * nopt);

            opt[i] = x;

            if (*POS == ':') {
                POS++;
                int y = 0;

                // take characters
                c = *POS;
                assert('0' <= c && c <= '9');
                do {
                    POS++;

                    // advance count
                    y = 10 * y + (c - '0');

                    c = *POS;
                } while ('0' <= c && c <= '9');

                // add a bunch more optional ones
                int off = nopt;
                nopt += y - x - 1;
                opt = realloc(opt, sizeof(*opt) * nopt);

                for (i = 0; i < y - x - 1; ++i) {
                    opt[off + i] = x + i + 1;
                }
            }
        }

        // sort that
        mg_re_sorti_(nopt, opt);

        // expect end
        assert(*POS == '}');
        POS++;

        // number to repeat
        int num_rep = re->nfa_len - sl;

        // the position to repeat from
        // NOTE: this changes each time
        int pos_rep = sl;

        // how many are required so far
        int sofar = 1;

        int lsl_opt = -1;

        // now, sort in place
        int i;
        for (i = 0; i < nopt; ++i) {

            int sl_opt = -1;
            if (i > 0) {
                // add optional component
                sl_opt = re->nfa_len;
                mg_re_push_(re, MG_RE_NFA_EPS)->v = sl_opt + 1;
                // link to the optional
                mg_re_link_(re, pos_rep, pos_rep + num_rep, -2, sl_opt);
            }

            int j;
            for (j = sofar; j < opt[i]; ++j) {
                int sl_cur = re->nfa_len;

                // repeat it
                mg_re_duprange_(re, pos_rep, pos_rep + num_rep);

                if (i == 0 || j != sofar) mg_re_link_(re, pos_rep, pos_rep + num_rep, -2, (res - sl) + sl_cur);
                else mg_re_link_(re, sl_cur, sl_cur + num_rep, sl_opt, -2);

                // repeat from this location
                pos_rep = sl_cur;
            }

            // save state for next time
            sofar = opt[i];
        }

    } else if (*POS == '*') {
        POS++;

        // repeat [0, inf) times, by adding an epsilon transition back to self
        // length of the star node
        int sl_star = re->nfa_len;
        mg_re_push_(re, MG_RE_NFA_EPS)->v = sl;

        // link all previous to 'sl_star'
        mg_re_link_(re, lsl, sl_star, -2, sl_star);

        // now, we need to enter at the star
        res = sl_star;

    } else {
        // just character match, so we only added 1 state
        mg_re_link_(re, lsl, sl, -2, sl);
    }

    // return entry point
    return res;
}

// internal routine to parse a term, and return the entry position, or -1 if an error
static int
mg_re_parse_term_(mg_re re, const char** pos, int lsl) {

    // entry point
    int res = re->nfa_len;

    int ct = 0;
    while (*POS && *POS != '|' && *POS != ')') {

        // starting length
        int sl = re->nfa_len;

        // parse another factor, get entry point
        int e = mg_re_parse_factor_(re, pos, sl);
        if (e < 0) return e;

        if (ct > 0) {
            // link all previous to the entry point
            mg_re_link_(re, lsl, sl, -2, e);
        } else {
            // return entry point
            res = e;
        }

        // restart now
        lsl = sl;
        ct++;
    }

    return res;
}


// internal routine to parse an entire routine, and return entry point
static int
mg_re_parse_(mg_re re, const char** pos, int lsl) {
    int res = -1;
    int ct = 0;

    while (*POS && *POS != ')') {
        int sl = re->nfa_len;
        // capture
        int ssl = sl;

        while (*POS && *POS != '|' && *POS != ')') {
            int sl_cur = re->nfa_len;
            // create a bunch of terms
            int e = mg_re_parse_term_(re, pos, sl_cur);
            if (e < 0) return e;
            if (res < 0) res = e;

            // link previous to this entry point
            mg_re_link_(re, ssl, sl_cur, -2, e);

            ssl = sl_cur;
        }

        if (ct > 0) {
            // add an epsilon that is the result
            int sl_opt = re->nfa_len;
            struct mg_re_nfa* nfa = mg_re_push_(re, MG_RE_NFA_EPS);

            // link back to the previous
            nfa->u = sl;
            nfa->v = lsl;

            res = sl_opt;
        }

        // end here
        ct++;
        if (*POS != '|') break;

        // advance past it
        assert(*POS == '|');
        POS++;

        lsl = sl;
    }
    if (res < 0) res = 0;
    return res;
}


mg_re
mg_re_make(const char* expr) {
    mg_re re = malloc(sizeof(*re));
    assert(re != NULL);

    // copy expression
    int sl = strlen(expr);
    re->expr = malloc(sl + 1);
    assert(re->expr != NULL);
    strcpy(re->expr, expr);

    // start with an empty graph
    re->nfa_len = 0;
    re->nfa = NULL;

    // parse in-place
    re->entry = mg_re_parse_(re, &expr, 0);
    if (re->entry < 0) {
        mg_re_free(re);
        return NULL;
    }

    //printf("[%i], entry: %i\n", re->nfa_len, re->entry);
    //printf("%i,%c: %i, %i\n", re->nfa[0].kind, re->nfa[0].ucp, re->nfa[0].u, re->nfa[0].v);
    //printf("%i, %i\n", re->nfa[1].u, re->nfa[1].v);
    //printf("%i, %i\n", re->nfa[2].u, re->nfa[2].v);
    //printf("%i, %i\n", re->nfa[3].u, re->nfa[3].v);

    return re;
}

void
mg_re_free(mg_re re) {

    free(re->expr);
    free(re->nfa);

    free(re);
}


// mg_reit //

void
mg_reit_init(struct mg_reit* reit, mg_re re) {
    reit->re = re;
    reit->in = malloc(sizeof(*reit->in) * reit->re->nfa_len);
    assert(reit->in != NULL);
    reit->lastin = malloc(sizeof(*reit->lastin) * reit->re->nfa_len);
    assert(reit->lastin != NULL);

    // reset data
    mg_reit_reset(reit);
}

void
mg_reit_free(struct mg_reit* reit) {
    free(reit->in);
    free(reit->lastin);
}

void
mg_reit_reset(struct mg_reit* reit) {
    int i;
    for (i = 0; i < reit->re->nfa_len; ++i) {
        reit->lastin[i] = reit->in[i] = false;
    }
}


// internal routine to add a state (set bm[i]), possibly following other transitions
// NOTE: returns whether we are in a matching state
static bool
mg_reit_add_(struct mg_reit* reit,  int s) {
    if (s == -1) {
        // -1: empty/nothing further
        return false;
    } else if (s == -2) {
        // -2: accept/match
        return true;
    } else {
        // should have handled all special values
        assert(s >= 0);
        // should be a valid index
        assert(s < reit->re->nfa_len);

        // add children (which may be empty/invalid)
        bool res = false;
        if (reit->re->nfa[s].kind == MG_RE_NFA_EPS) {
            // epsilon, so add children instead
            if (mg_reit_add_(reit, reit->re->nfa[s].u)) res = true;
            if (mg_reit_add_(reit, reit->re->nfa[s].v)) res = true;
        } else {
            // not epsilon, so add transition directly
            reit->in[s] = true;
        }

        return res;
    }
}

bool
mg_reit_sig_linebegin(struct mg_reit* reit) {
    return mg_reit_next(reit, -1 /* magic value */);
}

bool
mg_reit_sig_lineend(struct mg_reit* reit) {
    return mg_reit_next(reit, -2 /* magic value */);
}

bool
mg_reit_addentry(struct mg_reit* reit) {
    // match empty
    if (reit->re->nfa_len <= 0) return true;
    return mg_reit_add_(reit, reit->re->entry);
}


bool
mg_reit_next(struct mg_reit* reit, int c) {
    // NOTE: no unicode support here!
    assert(c < 256);

    // swap the buffers, since we are doing the next iteration
    bool* intmp = reit->in;
    reit->in = reit->lastin;
    reit->lastin = intmp;

    // reset the current input
    int i;
    for (i = 0; i < reit->re->nfa_len; ++i) {
        reit->in[i] = false;
    }

    // what to return, whether we are in an accepting state
    bool res = false;

    // just normal character
    for (i = 0; i < reit->re->nfa_len; ++i) {
        if (reit->lastin[i]) {
            // we are in the current state
            struct mg_re_nfa* nfa = &reit->re->nfa[i];

            bool good = false;

            if (nfa->kind == MG_RE_NFA_SIG) {
                // accepts a special, negative value to feed
                good = c == nfa->sig;
            } else if (nfa->kind == MG_RE_NFA_UCP && c >= 0) {
                // see if we match the current character
                good = nfa->ucp == c;
            } else if (nfa->kind == MG_RE_NFA_ANY && c >= 0) {
                // NOTE: assumes c < 256
                good = nfa->any.any[c];
                // apply inversion
                if (nfa->any.inv) good = !good;
            }

            if (good) {
                // add the states
                if (mg_reit_add_(reit, nfa->u)) res = true;
                if (mg_reit_add_(reit, nfa->v)) res = true;
            }
        }
    }

    return res;
}


/// MG CLI (i.e. you don't need to copy this into your project) ///

// exit with usage message
static void
e_usage(bool isbad) {
    fprintf(stderr, "usage: mg [re] [src...=./]\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "  -h,--help            prints this help/usage menu and exits\n");
    fprintf(stderr, "  -[DNLS]              set the format columns ([Dd]ate, [Nn]ame, [Ll]ineno, [Ss]ource)\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "see: https://cade.site/2021/09/28/diy-regex-engine\n");
    fprintf(stderr, "author: Cade Brown <cade@cade.site>\n");

    // exit the process
    exit(isbad ? EXIT_FAILURE : EXIT_SUCCESS);
}

// exit with errno value
static void
e_errno(int errnoval, const char* msg) {
    fprintf(stderr, "mg: %s: %s\n", msg, strerror(errnoval));
    exit(EXIT_FAILURE);
}


// format match
static void
p_fmt(const char* fmt, const char* src, struct stat* sb, const char* line, int lineno) {
    char c;
    int ct = 0;
    while ((c = *fmt) != '\0') {
        fmt++;

        if (c == 'L' || c == 'l') {
            if (ct > 0) printf(" | ");
            // lineno display
            printf("%4i", lineno+1/*1-based*/);
        } else if (c == 'S' || c == 's') {
            if (ct > 0) printf(" | ");
            // source display
            // NOTE: we don't want newline
            printf("%.*s", (int)(strlen(line) - 1), line);
        } else if (c == 'N' || c == 'n') {
            if (ct > 0) printf(" | ");
            // source name
            // NOTE: we don't want newline
            printf("%-16s", src);
        } else if (c == 'D' || c == 'd') {
            if (ct > 0) printf(" | ");
            // date name
            char tmp[256];
            // print in ISO8601
            strftime(tmp, sizeof(tmp), "%FT%T", gmtime(&sb->st_mtime));
            printf("%s", tmp);
        }

        ct++;
    }

    // finish with a newline
    printf("\n");

    //printf("%4i | %-12s | %s", lineno+1/* 1-based */, src, line);
}


// utility function to read an entire line of text into '*pline' (of length '*n'),
//   automatically reallocating
// NOTE: returns the length of the line
static int
u_getline(FILE* fp, int* n, char** pline) {
    if (!*pline) {
        // need to allocate some space, so start with 1024
        if ((*pline = malloc(*n = 1024)) == NULL) {
            e_errno(errno, __func__);
        }
    }

    // iterate over characters until we hit the end or a newline
    char* pos = *pline;
    int c;
    while ((c = fgetc(fp)) != EOF && (*pos++ = c) != '\n') {
        int off = pos - *pline;
        if (off >= *n) {
            // we need to reallocate
            // this is a pretty good hueristic for most purposes
            *n = off * 2 + 8;

            if ((*pline = realloc(*pline, *n)) == NULL) {
                e_errno(errno, __func__);
            }

            // recalculate the other values
            pos = *pline + off;
        }
    }


    if (pos != *pline && pos[-1] != '\n') {
        // insert newline
        *pos = '\n';
        pos++;
    }

    // NUL-terminate
    *pos = '\0';

    return pos - *pline;
}

// perform search on 'src' file, with the given pattern and iter structure
static void
do_file(const char* fmt, const char* src, FILE* fp, struct stat* sb, mg_re re, struct mg_reit* reit, int* n, char** pline) {
    // attempt to open 'fp'
    if (!fp) {
        fp = fopen(src, "r");
        if (!fp) {
            e_errno(errno, src);
        }
    }


    // we need to reset our state
    mg_reit_reset(reit);

    // line number
    int lineno = 0;

    while (true) {
        // get the next line within the file
        int len = u_getline(fp, n, pline);
        if (!len) {
            // empty line means the end of file has been reached
            break;
        } else if (len < 0) {
            // problem
            e_errno(errno, src);
        }

        // whether or not the current line should be printed
        bool shouldprint = false;

        // we want to try to match at every position
        if (mg_reit_addentry(reit)) {
            shouldprint = true;
        }

        // simulate a linebegin
        if (mg_reit_sig_linebegin(reit)) {
            shouldprint = true;
        }

        // now, feed the line
        int i, c;
        for (i = 0; i < len; ++i) {
            c = (*pline)[i];
            if (c == '\n') break;

            // we want to try to match at every position
            if (mg_reit_addentry(reit)) {
                shouldprint = true;
            }

            // add the character
            if (mg_reit_next(reit, c)) {
                // in a matching state, so print it out
                shouldprint = true;
            }
        }

        // we want to try to match at every position
        if (mg_reit_addentry(reit)) {
            shouldprint = true;
        }

        // simulate a lineend
        if (mg_reit_sig_lineend(reit)) {
            shouldprint = true;
        }

        // now, print out if we've matched
        if (shouldprint) {
            p_fmt(fmt, src, sb, *pline, lineno);
            // for plain format
            //printf("%s", *pline);
        }

        lineno++;
    }

    // done with the file
    fclose(fp);

}

static void
do_src(const char* fmt, const char* src, mg_re re, struct mg_reit* reit, int* n, char** pline);


// perform search for 'src' recursively
static void
do_dir(const char* fmt, const char* src, mg_re re, struct mg_reit* reit, int* n, char** pline) {
    DIR* dp = opendir(src);
    if (!dp) {
        e_errno(errno, src);
    }
    // temporary buffer
    char tmp[PATH_MAX];

    // iterate over children
    struct dirent* de;
    while ((de = readdir(dp)) != NULL) {
        char* name = de->d_name;
        if (strcmp(".", name) == 0 || strcmp("..", name) == 0) {
            // ignore
        } else {
            // generate name and execute it
            snprintf(tmp, PATH_MAX, "%s/%s", src, name);
            do_src(fmt, tmp, re, reit, n, pline);
        }
    }

    // done
    closedir(dp);
}


// perform generic search
static void
do_src(const char* fmt, const char* src, mg_re re, struct mg_reit* reit, int* n, char** pline) {
    if (strcmp(src, "-") == 0) {        // stdin, special case
        struct stat sb;
        if (fstat(STDIN_FILENO, &sb) < 0) {
            e_errno(errno, src);
        }
        do_file(fmt, src, stdin, &sb, re, reit, n, pline);
        return;
    }
    // check what 'src' is
    struct stat sb;
    if (stat(src, &sb) < 0) {
        e_errno(errno, src);
    }

    if (S_ISREG(sb.st_mode)) {
        // regular file
        do_file(fmt, src, NULL, &sb, re, reit, n, pline);
    } else {
        // directory
        do_dir(fmt, src, re, reit, n, pline);
    }

}

// default list of sources to search
static const char* default_src[] = {
    "-",
};

// default format (name, lineno, src)
static const char* default_fmt = "NLS";

int main(int argc, const char** argv) {
    // requires arguments
    if (argc < 2) e_usage(true);

    // now, pull off the expr
    const char* expr = argv[1];

    const char* fmt = default_fmt;

    int i, j;
    for (j = i = 1; i < argc; ++i) {
        if (strcmp("-h", argv[i]) == 0 || strcmp("--help", argv[1]) == 0) {
            e_usage(false);
        } else if (argv[i][0] == '-' && argv[i][1] != '\0') {
            // claim next as format
            fmt = argv[i] + 1;
            // shortcuts
            if (strcmp("d", fmt) == 0 || strcmp("D", fmt) == 0) {
                fmt = "DNLS";
            }
        } else {
            // copy in and compact
            argv[j++] = argv[i];
        }
    }

    // account for arguments
    argc = j;


    // compile the expression
    mg_re re = mg_re_make(expr);
    if (!re) {
        // error should have been reported
        return EXIT_FAILURE;
    }

    // regex iterator, from our pattern
    struct mg_reit reit;
    mg_reit_init(&reit, re);


    // try to use the arguments passed in
    int nsrc = argc - 2;
    const char** src = argv + 2;

    if (nsrc == 0) {
        // none were given, so use the default sources:
        nsrc = sizeof(default_src) / sizeof(*default_src);
        src = default_src;
    }

    // for u_getline()
    int n = 0;
    char* pline = NULL;

    for (i = 0; i < nsrc; ++i) {

        // perform generic search
        do_src(fmt, src[i], re, &reit, &n, &pline);

    }

    // you don't really need to at the end of 'main()', but its a good habit
    //   to free your resources
    mg_re_free(re);

}


## mgtest.txt

a
aa
aaa
aaaa
aaaaa
b
c
ab
ac
bc
abc
aaabbb
cat
bat
rat
mat
map
trap
crap
back
wack
stack

	a
	aa
	aaa
	aaaa
	aaaaa
	b
	c
	ab
	ac
	bc
	abc
	aaabbb
	cat
	bat
	rat
	mat
	map
	trap
	crap
	back
	wack
	stack