@lsem
Last active January 3, 2018 10:16
PhresalsFinder
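// PhresalsFinder: locates two-word phrasal verbs (e.g. "get up", "try on") in a
// text using a sorted in-memory dictionary and a simple whitespace/punctuation
// tokenizer. Run with -t for the built-in tests, or with -f <text file>
// -d <dictionary file> for a normal run (note that the dictionary file path is
// parsed but the hardcoded dictionary is currently used, see main()).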
#include <algorithm>
#include <cstdio>
#include <fstream>
#include <functional>
#include <iostream>
#include <list>
#include <map>
#include <set>
#include <string>
#include <vector>
// Make all asserts runtime: undefine NDEBUG (the standard macro controlling
// assert) and include <cassert> afterwards, so the active definition of assert
// is in effect even in release builds.
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <cassert>
// '\r' is listed so that Windows-style line endings do not end up inside tokens.
static const char* const WHITESPACES = "\t \n\r";
static const char* const DELIMITERS = ",.;?!";
namespace
{
bool g_verbose = true;
// NOTE: ##__VA_ARGS__ relies on a widely supported compiler extension (GCC/Clang)
// that drops the leading comma when no variadic arguments are passed, which is
// how these macros are used in calls like verbose_msg( "match" ).
#define verbose_msg( msg, ... ) \
do \
{ \
if ( g_verbose ) \
std::printf( "VRB: " msg "\n", ##__VA_ARGS__ ); \
} while ( false )
#define err_msg( msg, ... ) std::fprintf( stderr, "ERR: " msg "\n", ##__VA_ARGS__ )
#define warn_msg( msg, ... ) std::printf( "WARN: " msg "\n", ##__VA_ARGS__ )
#define info_msg( msg, ... ) std::printf( "" msg "\n", ##__VA_ARGS__ )
}
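// Minimal test helpers: both macros are intended for use inside functions that
// return bool, and they make the enclosing test function return false on failure.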
#define test_assert( Condition ) \
do \
{ \
if ( !( Condition ) ) \
{ \
std::printf( "test condition failed: `%s` at line %d\n", #Condition, __LINE__ ); \
return false; \
} \
} while ( false )
#define test_eq( Expected, Actual ) \
do \
{ \
auto expected = Expected; \
auto actual = Actual; \
if ( expected != actual ) \
{ \
std::printf( "test condition failed: Expected %s (%s) != Actual %s (%s) at line %d\n", \
#Expected, std::to_string( expected ).c_str( ), #Actual, \
std::to_string( actual ).c_str( ), __LINE__ ); \
return false; \
} \
} while ( false )
#define DISABLED_TEST_CASES ;
namespace phresal_matcher_lib
{
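// Splits text into Regular word tokens and single-character Punctuation tokens,
// skipping whitespace. Tokens can be pushed back via put_token_back() and are
// re-served by next_token() in LIFO order before the stream is advanced further.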
class TextTokenizer
{
public:
enum class TokenType
{
Regular,
Punctuation,
End
};
struct Token
{
TokenType type;
std::string data;
};
public:
explicit TextTokenizer( const std::string& text,
std::string whitespaces_chars,
std::string punctuation_mark_chars )
{
// Append '\n' as a terminating delimiter so that the last token is always emitted.
this->text = text + "\n";
this->offset = 0;
this->whitespaces_chars = std::move( whitespaces_chars );
this->punctuation_mark_chars = std::move( punctuation_mark_chars );
}
void
skip_whitespaces( )
{
this->offset = this->text.find_first_not_of( this->whitespaces_chars, this->offset );
}
void
skip_punctuation_marks( )
{
this->offset = this->text.find_first_not_of( this->punctuation_mark_chars, this->offset );
}
Token
next_token( )
{
if ( !returned_tokens.empty( ) )
{
auto result = returned_tokens.back( );
returned_tokens.pop_back( );
return result;
}
// find next token start position
skip_whitespaces( );
if ( !end( ) )
{
if ( this->punctuation_mark_chars.find( this->text[ this->offset ] )
!= std::string::npos )
{
std::string tok;
tok += this->text[ this->offset ];
this->offset += 1;
return {TokenType::Punctuation, tok};
}
}
if ( !end( ) )
{
size_t token_end_pos
= this->text.find_first_of( TextTokenizer::separators( ), this->offset );
if ( token_end_pos != std::string::npos )
{
std::string token( this->text, this->offset, token_end_pos - this->offset );
this->offset = token_end_pos;
verbose_msg( "yield(%s)", token.c_str( ) );
return {TokenType::Regular, token};
}
}
return {TokenType::End, ""};
}
void
put_token_back( Token token )
{
returned_tokens.push_back( std::move( token ) );
}
bool
end( ) const
{
return this->offset == this->text.size( ) || this->offset == std::string::npos;
}
std::string
separators( ) const
{
// The separator set is the union of whitespace and punctuation characters.
// Returned by value so every tokenizer instance uses its own configuration.
return whitespaces_chars + punctuation_mark_chars;
}
private:
std::list< Token > returned_tokens;
std::string whitespaces_chars, punctuation_mark_chars;
std::string text;
size_t offset;
};
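// Basic sanity checks for TextTokenizer, including the put_token_back() round trip.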
bool
tokenizer_basic_test( )
{
TextTokenizer tokenizer( " Quick brown fox, jumps over the lazy dog!", WHITESPACES,
DELIMITERS );
test_assert( tokenizer.end( ) == false );
{
auto t = tokenizer.next_token( );
test_assert( t.data == "Quick" );
test_assert( t.type == TextTokenizer::TokenType::Regular );
test_assert( !tokenizer.end( ) );
}
test_assert( tokenizer.next_token( ).data == "brown" );
test_assert( !tokenizer.end( ) );
test_assert( tokenizer.next_token( ).data == "fox" );
test_assert( !tokenizer.end( ) );
{
auto t = tokenizer.next_token( );
test_assert( t.data == "," );
test_assert( t.type == TextTokenizer::TokenType::Punctuation );
test_assert( !tokenizer.end( ) );
}
test_assert( tokenizer.next_token( ).data == "jumps" );
test_assert( !tokenizer.end( ) );
test_assert( tokenizer.next_token( ).data == "over" );
test_assert( !tokenizer.end( ) );
test_assert( tokenizer.next_token( ).data == "the" );
test_assert( !tokenizer.end( ) );
tokenizer.put_token_back( {TextTokenizer::TokenType::Regular, "the"} );
tokenizer.put_token_back( {TextTokenizer::TokenType::Regular, "over"} );
test_assert( tokenizer.next_token( ).data == "over" );
test_assert( !tokenizer.end( ) );
test_assert( tokenizer.next_token( ).data == "the" );
test_assert( !tokenizer.end( ) );
test_assert( tokenizer.next_token( ).data == "lazy" );
test_assert( !tokenizer.end( ) );
test_assert( tokenizer.next_token( ).data == "dog" );
test_assert( !tokenizer.end( ) );
test_assert( tokenizer.next_token( ).data == "!" );
test_assert( !tokenizer.end( ) );
test_assert( tokenizer.next_token( ).data == "" );
test_assert( tokenizer.end( ) );
return true;
}
enum class DictionaryEntryType
{
verb,
phresal_verb,
invalid,
any = invalid
};
struct DictionaryEntry
{
DictionaryEntryType type;
std::vector< std::string > tokens;
};
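// Holds plain verbs and two-word phrasal verbs in a sorted vector so the
// look-ups below can use std::lower_bound/std::upper_bound: an exact phrasal
// look-up, a range look-up by first word, and a plain-verb look-up.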
class Dictionary
{
public:
using const_iterator = std::vector< DictionaryEntry >::const_iterator;
using const_iterators_range = std::pair< const_iterator, const_iterator >;
public:
explicit Dictionary( std::vector< DictionaryEntry > data )
{
this->data = std::move( data );
std::sort( this->data.begin( ), this->data.end( ), compare_tokens_less );
}
const_iterator
end( ) const
{
return this->data.end( );
}
private:
static bool
compare_tokens_less( const DictionaryEntry& lhs, const DictionaryEntry& rhs )
{
return ( lhs.type < rhs.type ) || ( lhs.type == rhs.type && lhs.tokens < rhs.tokens );
}
static bool
compare_by_first_token( const DictionaryEntry& lhs, const DictionaryEntry& rhs )
{
if ( lhs.type < rhs.type )
{
return true;
}
else if ( lhs.type == rhs.type )
{
if ( lhs.tokens.empty( ) || rhs.tokens.empty( ) )
{
return lhs.tokens.empty( ) < rhs.tokens.empty( );
}
return lhs.tokens[ 0 ] < rhs.tokens[ 0 ];
}
else
{
// lhs.type > rhs.type
return false;
}
}
public:
const_iterator
look_up_phresal( const std::vector< std::string >& tokens ) const
{
if ( tokens.empty( ) )
return end( );
std::string tokens_str;
if ( g_verbose )
{
for ( auto t : tokens )
tokens_str += t + ", ";
tokens_str.resize( tokens_str.size( ) - 2 );
}
DictionaryEntry search_value{DictionaryEntryType::phresal_verb, tokens};
auto lb_it
= std::lower_bound( data.begin( ), data.end( ), search_value, compare_tokens_less );
if ( lb_it != data.end( ) && !compare_tokens_less( search_value, *lb_it ) )
{
if ( g_verbose )
{
verbose_msg( "look_up() succeeded: %s", tokens_str.c_str( ) );
}
return lb_it;
}
if ( g_verbose )
{
verbose_msg( "look_up() failed: %s", tokens_str.c_str( ) );
}
return end( );
}
std::pair< const_iterator, const_iterator >
lookup_by_first_word( const std::string& word ) const
{
DictionaryEntry search_value{DictionaryEntryType::phresal_verb, {word}};
auto lb_it
= std::lower_bound( data.begin( ), data.end( ), search_value, compare_by_first_token );
if ( lb_it != data.end( ) && lb_it->type == DictionaryEntryType::phresal_verb
&& !lb_it->tokens.empty( ) && lb_it->tokens[ 0 ] == word )
{
auto ub_it = std::upper_bound( data.begin( ), data.end( ), search_value,
compare_by_first_token );
return {lb_it, ub_it};
}
else
{
return {end( ), end( )};
}
}
const_iterator
lookup_for_verb( const std::string& word ) const
{
DictionaryEntry search_value{DictionaryEntryType::verb, {word}};
auto lb_it
= std::lower_bound( data.begin( ), data.end( ), search_value, compare_tokens_less );
if ( lb_it != data.end( ) && !compare_tokens_less( search_value, *lb_it ) )
{
return lb_it;
}
return end( );
}
private:
std::vector< DictionaryEntry > data;
};
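// Built-in dictionary used by main( ); a dictionary file passed via -d is not
// loaded yet.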
std::vector< DictionaryEntry >
get_hardcoded_dictionary( )
{
static const std::vector< DictionaryEntry > result = {
{DictionaryEntryType::phresal_verb, {"get", "on"}},
{DictionaryEntryType::phresal_verb, {"get", "off"}},
{DictionaryEntryType::phresal_verb, {"try", "on"}},
{DictionaryEntryType::phresal_verb, {"take", "off"}},
{DictionaryEntryType::phresal_verb, {"look", "up"}},
{DictionaryEntryType::phresal_verb, {"get", "up"}},
{DictionaryEntryType::phresal_verb, {"get", "back"}},
{DictionaryEntryType::phresal_verb, {"pick", "up"}},
};
return result;
}
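// Streams tokens from the tokenizer and reports, via the callback, every
// dictionary phrasal verb whose two words appear in order, possibly separated
// by other words. A match in progress is abandoned when a punctuation mark or
// a dictionary verb shows up in between; an interrupting verb is pushed back
// so it can start a new match on the next iteration.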
class PhresalsMatcher
{
public:
struct MatchInfo
{
DictionaryEntry phresal;
size_t column, row, offset;
};
using ReportResultCb = std::function< void( const MatchInfo& match_info ) >;
PhresalsMatcher( TextTokenizer tokenizer, const Dictionary& dictionary, ReportResultCb cb )
: tokenizer( tokenizer )
, dictionary( dictionary )
, cb( cb )
{
}
// TODO: 1) Yield the match position (row, column, offset).
// 2) Check whether a phrasal verb is split by a regular verb: e.g. in
// `get xxx acquire by`, `get by` should not match because a verb stands in
// between. 3) The same as (2), but for punctuation marks.
void
match_all( )
{
std::string first_phresal_token, second_phresal_token;
bool in_phresal = false;
Dictionary::const_iterators_range lookup_range_result{dictionary.end( ), dictionary.end( )};
while ( !tokenizer.end( ) )
{
auto token = tokenizer.next_token( );
Dictionary::const_iterator lookup_result = dictionary.end( );
if ( !in_phresal )
{
verbose_msg( "lookup verb '%s'", token.data.c_str( ) );
lookup_range_result = dictionary.lookup_by_first_word( token.data );
if ( lookup_range_result.first != dictionary.end( ) )
{
unsigned i = 0;
for ( auto it = lookup_range_result.first; it != lookup_range_result.second;
++it )
{
verbose_msg( "%d: lookup verb '%s' SUCCESS: [%s:%s]", i++,
token.data.c_str( ), it->tokens[ 0 ].c_str( ),
it->tokens[ 1 ].c_str( ) );
}
// The first word matched one or more phrasal verbs.
in_phresal = true;
}
else
{
verbose_msg( "status == notinphresal, lookup verb '%s' FAILED",
token.data.c_str( ) );
}
}
else
{
// In a potential phrasal-verb match.
// Check whether the current token breaks it: a match can be broken by a
// regular verb or by a punctuation mark (a simplified heuristic).
if ( token.type == TextTokenizer::TokenType::Punctuation )
{
verbose_msg( "in phresal false due to breaking punctuation mark: [%s]!",
token.data.c_str( ) );
// current match broken
in_phresal = false;
}
else if ( ( lookup_result = dictionary.lookup_for_verb( token.data ) )
!= dictionary.end( ) )
{
assert( lookup_result->type == DictionaryEntryType::verb );
// current match broken
in_phresal = false;
verbose_msg( "in phresal false due to breaking VERB %s!",
lookup_result->tokens[ 0 ].c_str( ) );
// This verb may itself start another phrasal verb; put it back so the next
// iteration can handle that case.
tokenizer.put_token_back( std::move( token ) );
}
else
{
verbose_msg(
"in phresal, trying to match the second word; current token is [%s], "
"there are %d candidates",
token.data.c_str( ),
(int)std::distance( lookup_range_result.first, lookup_range_result.second ) );
// This word does not break the current match; try to match it against the
// second token of each candidate phrasal verb.
for ( auto it = lookup_range_result.first; it != lookup_range_result.second;
++it )
{
assert( it->type == DictionaryEntryType::phresal_verb );
assert( it->tokens.size( ) == 2 );
verbose_msg( "testing token: %s", it->tokens[ 1 ].c_str( ) );
if ( token.data == it->tokens[ 1 ] )
{
verbose_msg( "matched second word of phresal" );
cb( MatchInfo{DictionaryEntry{DictionaryEntryType::phresal_verb,
{it->tokens[ 0 ], it->tokens[ 1 ]}},
0, 0, 0} );
in_phresal = false;
break;
}
}
}
}
}
}
private:
TextTokenizer tokenizer;
const Dictionary& dictionary;
ReportResultCb cb;
};
} // namespace phresal_matcher_lib
void
run_tests( )
{
using namespace phresal_matcher_lib;
auto run_all = []( std::vector< std::pair< const char*, std::function< bool( ) > > > tests ) {
std::vector< std::string > passed, failed;
for ( auto test : tests )
{
info_msg( "running test case: %s", test.first );
if ( test.second( ) )
{
info_msg( "%s passed.\n", test.first );
passed.emplace_back( std::string( test.first ) );
}
else
{
failed.emplace_back( std::string( test.first ) );
}
}
info_msg( "*** Summary ***\n" );
info_msg( "Passed tests:" );
for ( auto t : passed )
{
std::printf( "\t%s\n", t.c_str( ) );
}
if ( !failed.empty( ) )
{
info_msg( "Failed tests:" );
for ( auto t : failed )
{
std::printf( "\t%s\n", t.c_str( ) );
}
}
else
{
info_msg( "No failed tests.\n" );
}
};
auto dictionary_basic_test = []( ) -> bool {
Dictionary dict{get_hardcoded_dictionary( )};
test_assert( dict.look_up_phresal( {"look", "up"} ) != dict.end( ) );
test_assert( dict.look_up_phresal( {"look", "up"} )->type
== DictionaryEntryType::phresal_verb );
test_assert( dict.look_up_phresal( {"fuck", "off"} ) == dict.end( ) );
// TODO: add a test case where a first-token query yields multiple candidates.
// TODO: test lookup_for_verb.
return true;
};
auto matcher_basic_test = []( ) -> bool {
std::vector< std::string > matched_phresals;
Dictionary dict{{
{DictionaryEntryType::phresal_verb, {"get", "up"}},
{DictionaryEntryType::phresal_verb, {"get", "down"}},
{DictionaryEntryType::phresal_verb, {"try", "on"}},
{DictionaryEntryType::phresal_verb, {"set", "off"}},
{DictionaryEntryType::verb, {"walk"}}, // Used as stopper for set off
{DictionaryEntryType::phresal_verb, {"make", "up"}},
{DictionaryEntryType::phresal_verb, {"make", "off"}},
// Regular verbs that also appear as the first word of a phrasal verb (e.g.
// "get") must be listed as plain verb entries, otherwise the matcher does not
// behave as expected.
{DictionaryEntryType::verb, {"get"}},
}};
std::string text
// 1) Regular `get up` match
= "I get up in the monning.\n"
// 2) .. try ... on ... pattern
"I always try new things on.\n"
// 3) `set off` is not matched because of `walk`, which is a regular verb.
"I set these walk off\n"
// 4) After the previous case, check that the state recovers and we can still
// match cases like (2).
"I also make my bed up every day"
"\n"
// 5) .. get XXX get up. Must match latest `get up` (todo: check offset).
"I get XXX get up"
"\n"
// 6) .. get XXX get YYY up. Must match latest `get up` like in case (2).
"I get XXX get things up"
"\n"
// 7) Phresal verb broken by punctuation mark does not count.
"I get . up"
"\n"
// 8) Phresal verb broken by punctuation mark does not count.
"I get ; up"
"\n"
// 9) Phresal verb broken by punctuation mark does not count.
"I get, up"
"\n"
// 10) Phresal verb broken by punctuation mark does not count.
"I get ! up"
"\n";
TextTokenizer tokenizer( text, WHITESPACES, DELIMITERS );
PhresalsMatcher matcher{tokenizer, dict,
[&matched_phresals]( const PhresalsMatcher::MatchInfo& info ) {
verbose_msg( "match" );
matched_phresals.push_back( info.phresal.tokens[ 0 ] + " "
+ info.phresal.tokens[ 1 ] );
}};
matcher.match_all( );
test_eq( size_t( 5 ), matched_phresals.size( ) );
verbose_msg( "\nmatched phresals:" );
for ( auto phresal : matched_phresals )
{
verbose_msg( "%s", phresal.c_str( ) );
}
test_assert( std::find( matched_phresals.begin( ), matched_phresals.end( ), "get up" )
!= matched_phresals.end( ) );
test_assert( std::find( matched_phresals.begin( ), matched_phresals.end( ), "try on" )
!= matched_phresals.end( ) );
test_assert( std::find( matched_phresals.begin( ), matched_phresals.end( ), "set off" )
== matched_phresals.end( ) );
test_assert( std::find( matched_phresals.begin( ), matched_phresals.end( ), "make up" )
!= matched_phresals.end( ) );
test_assert( std::find( matched_phresals.begin( ), matched_phresals.end( ), "get up" )
!= matched_phresals.end( ) );
return true;
};
run_all( {
{"tokenizer_basic_test", tokenizer_basic_test},
{"dictionary_basic_test", dictionary_basic_test},
{"matcher_basic_test", matcher_basic_test},
// { "file_loader_test", file_loader_test },
// { "integration_file_tokenizing_test", integration_file_tokenizing_test }
} );
}
namespace application
{
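// Parses the supported options: -f <input file>, -d <dictionary file>,
// -v (verbose) and -t (run tests; -f/-d are then ignored). Returns false on
// invalid input.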
bool
parse_command_line_options( int argc,
char* argv[],
std::string& out_file,
std::string& out_dict,
bool& out_verbose,
bool& out_tests )
{
if ( argc < 2 )
{
err_msg( "In non test mode at least two options needed: input file and dictionary.\n" );
return false;
}
if ( argc == 2 )
{
if ( std::string( argv[ 1 ] ) == "-t" )
{
out_tests = true;
out_verbose = false;
return true;
}
else
{
err_msg( "At least two options needed: input file and dictionary.\n" );
return false;
}
}
bool only_flags_accepted = false;
std::string file, dict;
bool verbose = false, tests = false;
for ( int argn = 1; argn < argc; ++argn )
{
if ( !only_flags_accepted && std::string( argv[ argn ] ) == "-f" )
{
if ( argn + 1 < argc )
{
file = argv[ argn + 1 ];
}
else
{
err_msg( "-f should follow file path.\n" );
return false;
}
}
else if ( !only_flags_accepted && std::string( argv[ argn ] ) == "-d" )
{
if ( argn + 1 < argc )
{
dict = argv[ argn + 1 ];
}
else
{
err_msg( "-d should follow dictionary file path.\n" );
return false;
}
}
else if ( std::string( argv[ argn ] ) == "-v" )
{
verbose = true;
}
else if ( std::string( argv[ argn ] ) == "-t" )
{
tests = true;
}
if ( !dict.empty( ) && !file.empty( ) )
{
only_flags_accepted = true;
}
}
if ( tests )
{
if ( !dict.empty( ) || !file.empty( ) )
{
warn_msg( "dict and file options are ingored in test mode\n" );
}
out_verbose = verbose;
out_tests = true;
return true;
}
else if ( !dict.empty( ) && !file.empty( ) )
{
out_file = std::move( file );
out_dict = std::move( dict );
out_verbose = verbose;
out_tests = false;
return true;
}
else
{
err_msg(
"-f and -d are mandatory command-line options in a non-test run. Please specify "
"them or request test mode with -t." );
return false;
}
}
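// Reads the whole file into out_text; returns false if the file cannot be
// opened or fully read.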
bool
load_text_from_file( const std::string& file_path, std::string& out_text )
{
// Binary mode keeps tellg( ) consistent with the number of bytes read( ) returns.
std::ifstream file_stream( file_path, std::ios::in | std::ios::binary );
if ( !file_stream )
{
return false;
}
file_stream.seekg( 0, file_stream.end );
size_t file_size = file_stream.tellg( );
file_stream.seekg( 0, file_stream.beg );
std::string text( file_size, '\0' );
file_stream.read( &text[ 0 ], file_size );
if ( file_stream )
{
out_text = text;
verbose_msg( "read text: %s", text.c_str( ) );
return true;
}
return false;
}
namespace
{
std::string
file_name_from_file_path( std::string fpath )
{
// Handle both Windows and POSIX path separators.
size_t pos = fpath.find_last_of( "\\/" );
if ( pos != std::string::npos )
{
return std::string( fpath, pos + 1, fpath.size( ) - pos );
}
return fpath;
}
}
void
show_usage( std::string self_name )
{
std::printf( "Phresal verb finder. (c) lsem, 2017\n" );
std::printf( "USAGE:\n" );
std::printf( "%3s%s OPTIONS\n\n", "", self_name.c_str( ) );
std::printf( "Available options:\n" );
const char* option_format_str = "%5s%-5s%-15s%-35s\n";
std::printf( option_format_str, "-f", "", "FILE PATH", "Path to input file." );
std::printf( option_format_str, "-d", "", "FILE PATH", "Path to dictionary file." );
std::printf( option_format_str, "-t", "", "",
"Flag makes the app run tests instead of running app." );
std::printf( option_format_str, "-v", "", "", "Flag enables verbose mode." );
}
} // namespace application
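// Example invocations (assuming the compiled binary is named `phresals_finder`;
// the actual name depends on how this file is built):
//   phresals_finder -t                          run the built-in tests
//   phresals_finder -f text.txt -d dict.txt -v  match phrasal verbs in text.txt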
int
main( int argc, char* argv[] )
{
std::string file_path, dict_file_path;
bool verbose = false, tests = false;
if ( !application::parse_command_line_options( argc, argv,
/*out*/ file_path,
/*out*/ dict_file_path,
/*out*/ verbose,
/*out*/ tests ) )
{
application::show_usage( application::file_name_from_file_path( argv[ 0 ] ) );
return 1;
}
g_verbose = verbose;
if ( tests )
{
run_tests( );
return 0;
}
else
{
verbose_msg( "effective file path `%s`", file_path.c_str( ) );
verbose_msg( "effective dict path: `%s`", dict_file_path.c_str( ) );
verbose_msg( "Loading text from file `%s`", file_path.c_str( ) );
std::string text;
if ( !application::load_text_from_file( file_path, text ) )
{
err_msg( "failed loading text from file" );
return 1;
}
auto match_cb = []( const phresal_matcher_lib::PhresalsMatcher::MatchInfo& info ) {
// Match positions are not tracked yet (see the TODO in PhresalsMatcher),
// so report the matched phrasal verb itself.
info_msg( "Found phrasal verb: %s %s", info.phresal.tokens[ 0 ].c_str( ),
info.phresal.tokens[ 1 ].c_str( ) );
};
phresal_matcher_lib::TextTokenizer tokenizer{text, WHITESPACES, DELIMITERS};
phresal_matcher_lib::Dictionary dict{phresal_matcher_lib::get_hardcoded_dictionary( )};
phresal_matcher_lib::PhresalsMatcher matcher{tokenizer, dict, match_cb};
matcher.match_all( );
return 0;
}
return 0;
}