@lsem
Last active January 3, 2018 10:16
PhresalsFinder
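// PhresalsFinder: locates two-word phrasal verbs (e.g. "get up", "try on") in a
// text using a sorted in-memory dictionary and a simple whitespace/punctuation
// tokenizer. Run with -t for the built-in tests, or with -f <text file>
// -d <dictionary file> for a normal run (note that the dictionary file path is
// parsed but the hardcoded dictionary is currently used, see main()).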
#include <algorithm>
#include <cstdio>
#include <fstream>
#include <functional>
#include <iostream>
#include <list>
#include <map>
#include <set>
#include <string>
#include <vector>
// Make all asserts runtime: undefine NDEBUG (the standard macro controlling
// assert) and include <cassert> afterwards, so the active definition of assert
// is in effect even in release builds.
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <cassert>
// '\r' is listed so that Windows-style line endings do not end up inside tokens.
static const char* const WHITESPACES = "\t \n\r";
static const char* const DELIMITERS = ",.;?!";
namespace
{
bool g_verbose = true;
// NOTE: ##__VA_ARGS__ relies on a widely supported compiler extension (GCC/Clang)
// that drops the leading comma when no variadic arguments are passed, which is
// how these macros are used in calls like verbose_msg( "match" ).
#define verbose_msg( msg, ... ) \
do \
{ \
if ( g_verbose ) \
std::printf( "VRB: " msg "\n", ##__VA_ARGS__ ); \
} while ( false )
#define err_msg( msg, ... ) std::fprintf( stderr, "ERR: " msg "\n", ##__VA_ARGS__ )
#define warn_msg( msg, ... ) std::printf( "WARN: " msg "\n", ##__VA_ARGS__ )
#define info_msg( msg, ... ) std::printf( "" msg "\n", ##__VA_ARGS__ )
}
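// Minimal test helpers: both macros are intended for use inside functions that
// return bool, and they make the enclosing test function return false on failure.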
#define test_assert( Condition ) \
do \
{ \
if ( !( Condition ) ) \
{ \
std::printf( "test condition failed: `%s` at line %d\n", #Condition, __LINE__ ); \
return false; \
} \
} while ( false )
#define test_eq( Expected, Actual ) \
do \
{ \
auto expected = Expected; \
auto actual = Actual; \
if ( expected != actual ) \
{ \
std::printf( "test condition failed: Expected %s (%s) != Actual %s (%s) at line %d\n", \
#Expected, std::to_string( expected ).c_str( ), #Actual, \
std::to_string( actual ).c_str( ), __LINE__ ); \
return false; \
} \
} while ( false )
#define DISABLED_TEST_CASES ;
namespace phresal_matcher_lib
{
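// Splits text into Regular word tokens and single-character Punctuation tokens,
// skipping whitespace. Tokens can be pushed back via put_token_back() and are
// re-served by next_token() in LIFO order before the stream is advanced further.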
class TextTokenizer
{
public:
enum class TokenType
{
Regular,
Punctuation,
End
};
struct Token
{
TokenType type;
std::string data;
};
public:
explicit TextTokenizer( const std::string& text,
std::string whitespaces_chars,
std::string punctuation_mark_chars )
{
// Append '\n' as a terminating delimiter so that the last token is always emitted.
this->text = text + "\n";
this->offset = 0;
this->whitespaces_chars = std::move( whitespaces_chars );
this->punctuation_mark_chars = std::move( punctuation_mark_chars );
}
void
skip_whitespaces( )
{
this->offset = this->text.find_first_not_of( this->whitespaces_chars, this->offset );
}
void
skip_punctuation_marks( )
{
this->offset = this->text.find_first_not_of( this->punctuation_mark_chars, this->offset );
}
Token
next_token( )
{
if ( !returned_tokens.empty( ) )
{
auto result = returned_tokens.back( );
returned_tokens.pop_back( );
return result;
}
// find next token start position
skip_whitespaces( );
if ( !end( ) )
{
if ( this->punctuation_mark_chars.find( this->text[ this->offset ] )
!= std::string::npos )
{
std::string tok;
tok += this->text[ this->offset ];
this->offset += 1;
return {TokenType::Punctuation, tok};
}
}
if ( !end( ) )
{
size_t token_end_pos
= this->text.find_first_of( TextTokenizer::separators( ), this->offset );
if ( token_end_pos != std::string::npos )
{
std::string token( this->text, this->offset, token_end_pos - this->offset );
this->offset = token_end_pos;
verbose_msg( "yield(%s)", token.c_str( ) );
return {TokenType::Regular, token};
}
}
return {TokenType::End, ""};
}
void
put_token_back( Token token )
{
returned_tokens.push_back( std::move( token ) );
}
bool
end( ) const
{
return this->offset == this->text.size( ) || this->offset == std::string::npos;
}
std::string
separators( ) const
{
// The separator set is the union of whitespace and punctuation characters.
// Returned by value so every tokenizer instance uses its own configuration.
return whitespaces_chars + punctuation_mark_chars;
}
private:
std::list< Token > returned_tokens;
std::string whitespaces_chars, punctuation_mark_chars;
std::string text;
size_t offset;
};
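// Basic sanity checks for TextTokenizer, including the put_token_back() round trip.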
bool
tokenizer_basic_test( )
{
TextTokenizer tokenizer( " Quick brown fox, jumps over the lazy dog!", WHITESPACES,
DELIMITERS );
test_assert( tokenizer.end( ) == false );
{
auto t = tokenizer.next_token( );
test_assert( t.data == "Quick" );
test_assert( t.type == TextTokenizer::TokenType::Regular );
test_assert( !tokenizer.end( ) );
}
test_assert( tokenizer.next_token( ).data == "brown" );
test_assert( !tokenizer.end( ) );
test_assert( tokenizer.next_token( ).data == "fox" );
test_assert( !tokenizer.end( ) );
{
auto t = tokenizer.next_token( );
test_assert( t.data == "," );
test_assert( t.type == TextTokenizer::TokenType::Punctuation );
test_assert( !tokenizer.end( ) );
}
test_assert( tokenizer.next_token( ).data == "jumps" );
test_assert( !tokenizer.end( ) );
test_assert( tokenizer.next_token( ).data == "over" );
test_assert( !tokenizer.end( ) );
test_assert( tokenizer.next_token( ).data == "the" );
test_assert( !tokenizer.end( ) );
tokenizer.put_token_back( {TextTokenizer::TokenType::Regular, "the"} );
tokenizer.put_token_back( {TextTokenizer::TokenType::Regular, "over"} );
test_assert( tokenizer.next_token( ).data == "over" );
test_assert( !tokenizer.end( ) );
test_assert( tokenizer.next_token( ).data == "the" );
test_assert( !tokenizer.end( ) );
test_assert( tokenizer.next_token( ).data == "lazy" );
test_assert( !tokenizer.end( ) );
test_assert( tokenizer.next_token( ).data == "dog" );
test_assert( !tokenizer.end( ) );
test_assert( tokenizer.next_token( ).data == "!" );
test_assert( !tokenizer.end( ) );
test_assert( tokenizer.next_token( ).data == "" );
test_assert( tokenizer.end( ) );
return true;
}
enum class DictionaryEntryType
{
verb,
phresal_verb,
invalid,
any = invalid
};
struct DictionaryEntry
{
DictionaryEntryType type;
std::vector< std::string > tokens;
};
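// Holds plain verbs and two-word phrasal verbs in a sorted vector so the
// look-ups below can use std::lower_bound/std::upper_bound: an exact phrasal
// look-up, a range look-up by first word, and a plain-verb look-up.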
class Dictionary
{
public:
using const_iterator = std::vector< DictionaryEntry >::const_iterator;
using const_iterators_range = std::pair< const_iterator, const_iterator >;
public:
explicit Dictionary( std::vector< DictionaryEntry > data )
{
this->data = std::move( data );
std::sort( this->data.begin( ), this->data.end( ), compare_tokens_less );
}
const_iterator
end( ) const
{
return this->data.end( );
}
private:
static bool
compare_tokens_less( const DictionaryEntry& lhs, const DictionaryEntry& rhs )
{
return ( lhs.type < rhs.type ) || ( lhs.type == rhs.type && lhs.tokens < rhs.tokens );
}
static bool
compare_by_first_token( const DictionaryEntry& lhs, const DictionaryEntry& rhs )
{
if ( lhs.type < rhs.type )
{
return true;
}
else if ( lhs.type == rhs.type )
{
if ( lhs.tokens.empty( ) || rhs.tokens.empty( ) )
{
return lhs.tokens.empty( ) < rhs.tokens.empty( );
}
return lhs.tokens[ 0 ] < rhs.tokens[ 0 ];
}
else
{
// lhs.type > rhs.type
return false;
}
}
public:
const_iterator
look_up_phresal( const std::vector< std::string >& tokens ) const
{
if ( tokens.empty( ) )
return end( );
std::string tokens_str;
if ( g_verbose )
{
for ( auto t : tokens )
tokens_str += t + ", ";
tokens_str.resize( tokens_str.size( ) - 2 );
}
DictionaryEntry search_value{DictionaryEntryType::phresal_verb, tokens};
auto lb_it
= std::lower_bound( data.begin( ), data.end( ), search_value, compare_tokens_less );
if ( lb_it != data.end( ) && !compare_tokens_less( search_value, *lb_it ) )
{
if ( g_verbose )
{
verbose_msg( "look_up() succeeded: %s", tokens_str.c_str( ) );
}
return lb_it;
}
if ( g_verbose )
{
verbose_msg( "look_up() failed: %s", tokens_str.c_str( ) );
}
return end( );
}
std::pair< const_iterator, const_iterator >
lookup_by_first_word( const std::string& word ) const
{
DictionaryEntry search_value{DictionaryEntryType::phresal_verb, {word}};
auto lb_it
= std::lower_bound( data.begin( ), data.end( ), search_value, compare_by_first_token );
if ( lb_it != data.end( ) && lb_it->type == DictionaryEntryType::phresal_verb
&& !lb_it->tokens.empty( ) && lb_it->tokens[ 0 ] == word )
{
auto ub_it = std::upper_bound( data.begin( ), data.end( ), search_value,
compare_by_first_token );
return {lb_it, ub_it};
}
else
{
return {end( ), end( )};
}
}
const_iterator
lookup_for_verb( const std::string& word ) const
{
DictionaryEntry search_value{DictionaryEntryType::verb, {word}};
auto lb_it
= std::lower_bound( data.begin( ), data.end( ), search_value, compare_tokens_less );
if ( lb_it != data.end( ) && !compare_tokens_less( search_value, *lb_it ) )
{
return lb_it;
}
return end( );
}
private:
std::vector< DictionaryEntry > data;
};
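// Built-in dictionary used by main( ); a dictionary file passed via -d is not
// loaded yet.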
std::vector< DictionaryEntry >
get_hardcoded_dictionary( )
{
static const std::vector< DictionaryEntry > result = {
{DictionaryEntryType::phresal_verb, {"get", "on"}},
{DictionaryEntryType::phresal_verb, {"get", "off"}},
{DictionaryEntryType::phresal_verb, {"try", "on"}},
{DictionaryEntryType::phresal_verb, {"take", "off"}},
{DictionaryEntryType::phresal_verb, {"look", "up"}},
{DictionaryEntryType::phresal_verb, {"get", "up"}},
{DictionaryEntryType::phresal_verb, {"get", "back"}},
{DictionaryEntryType::phresal_verb, {"pick", "up"}},
};
return result;
}
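// Streams tokens from the tokenizer and reports, via the callback, every
// dictionary phrasal verb whose two words appear in order, possibly separated
// by other words. A match in progress is abandoned when a punctuation mark or
// a dictionary verb shows up in between; an interrupting verb is pushed back
// so it can start a new match on the next iteration.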
class PhresalsMatcher
{
public:
struct MatchInfo
{
DictionaryEntry phresal;
size_t column, row, offset;
};
using ReportResultCb = std::function< void( const MatchInfo& match_info ) >;
PhresalsMatcher( TextTokenizer tokenizer, const Dictionary& dictionary, ReportResultCb cb )
: tokenizer( tokenizer )
, dictionary( dictionary )
, cb( cb )
{
}
// TODO: 1) Yield the match position (row, column, offset).
// 2) Check whether a phrasal verb is split by a regular verb: e.g. in
// `get xxx acquire by`, `get by` should not match because a verb stands in
// between. 3) The same as (2), but for punctuation marks.
void
match_all( )
{
std::string first_phresal_token, second_phresal_token;
bool in_phresal = false;
Dictionary::const_iterators_range lookup_range_result{dictionary.end( ), dictionary.end( )};
while ( !tokenizer.end( ) )
{
auto token = tokenizer.next_token( );
Dictionary::const_iterator lookup_result = dictionary.end( );
if ( !in_phresal )
{
verbose_msg( "lookup verb '%s'", token.data.c_str( ) );
lookup_range_result = dictionary.lookup_by_first_word( token.data );
if ( lookup_range_result.first != dictionary.end( ) )
{
unsigned i = 0;
for ( auto it = lookup_range_result.first; it != lookup_range_result.second;
++it )
{
verbose_msg( "%d: lookup verb '%s' SUCCESS: [%s:%s]", i++,
token.data.c_str( ), it->tokens[ 0 ].c_str( ),
it->tokens[ 1 ].c_str( ) );
}
// The first word matched one or more phrasal verbs.
in_phresal = true;
}
else
{
verbose_msg( "status == notinphresal, lookup verb '%s' FAILED",
token.data.c_str( ) );
}
}
else
{
// In a potential phrasal-verb match.
// Check whether the current token breaks it: a match can be broken by a
// regular verb or by a punctuation mark (a simplified heuristic).
if ( token.type == TextTokenizer::TokenType::Punctuation )
{
verbose_msg( "in phresal false due to breaking punctuation mark: [%s]!",
token.data.c_str( ) );
// current match broken
in_phresal = false;
}
else if ( ( lookup_result = dictionary.lookup_for_verb( token.data ) )
!= dictionary.end( ) )
{
assert( lookup_result->type == DictionaryEntryType::verb );
// current match broken
in_phresal = false;
verbose_msg( "in phresal false due to breaking VERB %s!",
lookup_result->tokens[ 0 ].c_str( ) );
// This verb may itself start another phrasal verb; put it back so the next
// iteration can handle that case.
tokenizer.put_token_back( std::move( token ) );
}
else
{
verbose_msg(
"in phresal, trying to match the second word; current token is [%s], "
"there are %d candidates",
token.data.c_str( ),
(int)std::distance( lookup_range_result.first, lookup_range_result.second ) );
// This word does not break the current match; try to match it against the
// second token of each candidate phrasal verb.
for ( auto it = lookup_range_result.first; it != lookup_range_result.second;
++it )
{
assert( it->type == DictionaryEntryType::phresal_verb );
assert( it->tokens.size( ) == 2 );
verbose_msg( "testing token: %s", it->tokens[ 1 ].c_str( ) );
if ( token.data == it->tokens[ 1 ] )
{
verbose_msg( "matched second word of phresal" );
cb( MatchInfo{DictionaryEntry{DictionaryEntryType::phresal_verb,
{it->tokens[ 0 ], it->tokens[ 1 ]}},
0, 0, 0} );
in_phresal = false;
break;
}
}
}
}
}
}
private:
TextTokenizer tokenizer;
const Dictionary& dictionary;
ReportResultCb cb;
};
} // namespace phresal_matcher_lib
void
run_tests( )
{
using namespace phresal_matcher_lib;
auto run_all = []( std::vector< std::pair< const char*, std::function< bool( ) > > > tests ) {
std::vector< std::string > passed, failed;
for ( auto test : tests )
{
info_msg( "running test case: %s", test.first );
if ( test.second( ) )
{
info_msg( "%s passed.\n", test.first );
passed.emplace_back( std::string( test.first ) );
}
else
{
failed.emplace_back( std::string( test.first ) );
}
}
info_msg( "*** Summary ***\n" );
info_msg( "Passed tests:" );
for ( auto t : passed )
{
std::printf( "\t%s\n", t.c_str( ) );
}
if ( !failed.empty( ) )
{
info_msg( "Failed tests:" );
for ( auto t : failed )
{
std::printf( "\t%s\n", t.c_str( ) );
}
}
else
{
info_msg( "No failed tests.\n" );
}
};
auto dictionary_basic_test = []( ) -> bool {
Dictionary dict{get_hardcoded_dictionary( )};
test_assert( dict.look_up_phresal( {"look", "up"} ) != dict.end( ) );
test_assert( dict.look_up_phresal( {"look", "up"} )->type
== DictionaryEntryType::phresal_verb );
test_assert( dict.look_up_phresal( {"fuck", "off"} ) == dict.end( ) );
// TODO: add a test case where a first-token query yields multiple candidates.
// TODO: test lookup_for_verb.
return true;
};
auto matcher_basic_test = []( ) -> bool {
std::vector< std::string > matched_phresals;
Dictionary dict{{
{DictionaryEntryType::phresal_verb, {"get", "up"}},
{DictionaryEntryType::phresal_verb, {"get", "down"}},
{DictionaryEntryType::phresal_verb, {"try", "on"}},
{DictionaryEntryType::phresal_verb, {"set", "off"}},
{DictionaryEntryType::verb, {"walk"}}, // Used as stopper for set off
{DictionaryEntryType::phresal_verb, {"make", "up"}},
{DictionaryEntryType::phresal_verb, {"make", "off"}},
// Regular verbs that also appear as the first word of a phrasal verb (e.g.
// "get") must be listed as plain verb entries, otherwise the matcher does not
// behave as expected.
{DictionaryEntryType::verb, {"get"}},
}};
std::string text
// 1) Regular `get up` match
= "I get up in the monning.\n"
// 2) .. try ... on ... pattern
"I always try new things on.\n"
// 3) `set off` is not matched because of `walk`, which is a regular verb.
"I set these walk off\n"
// 4) After the previous case, check that the state recovers and we can still
// match cases like (2).
"I also make my bed up every day"
"\n"
// 5) .. get XXX get up. Must match latest `get up` (todo: check offset).
"I get XXX get up"
"\n"
// 6) .. get XXX get YYY up. Must match latest `get up` like in case (2).
"I get XXX get things up"
"\n"
// 7) Phresal verb broken by punctuation mark does not count.
"I get . up"
"\n"
// 8) Phresal verb broken by punctuation mark does not count.
"I get ; up"
"\n"
// 9) Phresal verb broken by punctuation mark does not count.
"I get, up"
"\n"
// 10) Phresal verb broken by punctuation mark does not count.
"I get ! up"
"\n";
TextTokenizer tokenizer( text, WHITESPACES, DELIMITERS );
PhresalsMatcher matcher{tokenizer, dict,
[&matched_phresals]( const PhresalsMatcher::MatchInfo& info ) {
verbose_msg( "match" );
matched_phresals.push_back( info.phresal.tokens[ 0 ] + " "
+ info.phresal.tokens[ 1 ] );
}};
matcher.match_all( );
test_eq( size_t( 5 ), matched_phresals.size( ) );
verbose_msg( "\nmatched phresals:" );
for ( auto phresal : matched_phresals )
{
verbose_msg( "%s", phresal.c_str( ) );
}
test_assert( std::find( matched_phresals.begin( ), matched_phresals.end( ), "get up" )
!= matched_phresals.end( ) );
test_assert( std::find( matched_phresals.begin( ), matched_phresals.end( ), "try on" )
!= matched_phresals.end( ) );
test_assert( std::find( matched_phresals.begin( ), matched_phresals.end( ), "set off" )
== matched_phresals.end( ) );
test_assert( std::find( matched_phresals.begin( ), matched_phresals.end( ), "make up" )
!= matched_phresals.end( ) );
test_assert( std::find( matched_phresals.begin( ), matched_phresals.end( ), "get up" )
!= matched_phresals.end( ) );
return true;
};
run_all( {
{"tokenizer_basic_test", tokenizer_basic_test},
{"dictionary_basic_test", dictionary_basic_test},
{"matcher_basic_test", matcher_basic_test},
// { "file_loader_test", file_loader_test },
// { "integration_file_tokenizing_test", integration_file_tokenizing_test }
} );
}
namespace application
{
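// Parses the supported options: -f <input file>, -d <dictionary file>,
// -v (verbose) and -t (run tests; -f/-d are then ignored). Returns false on
// invalid input.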
bool
parse_command_line_options( int argc,
char* argv[],
std::string& out_file,
std::string& out_dict,
bool& out_verbose,
bool& out_tests )
{
if ( argc < 2 )
{
err_msg( "In non test mode at least two options needed: input file and dictionary.\n" );
return false;
}
if ( argc == 2 )
{
if ( std::string( argv[ 1 ] ) == "-t" )
{
out_tests = true;
out_verbose = false;
return true;
}
else
{
err_msg( "At least two options needed: input file and dictionary.\n" );
return false;
}
}
bool only_flags_accepted = false;
std::string file, dict;
bool verbose = false, tests = false;
for ( int argn = 1; argn < argc; ++argn )
{
if ( !only_flags_accepted && std::string( argv[ argn ] ) == "-f" )
{
if ( argn + 1 < argc )
{
file = argv[ argn + 1 ];
}
else
{
err_msg( "-f should follow file path.\n" );
return false;
}
}
else if ( !only_flags_accepted && std::string( argv[ argn ] ) == "-d" )
{
if ( argn + 1 < argc )
{
dict = argv[ argn + 1 ];
}
else
{
err_msg( "-d should follow dictionary file path.\n" );
return false;
}
}
else if ( std::string( argv[ argn ] ) == "-v" )
{
verbose = true;
}
else if ( std::string( argv[ argn ] ) == "-t" )
{
tests = true;
}
if ( !dict.empty( ) && !file.empty( ) )
{
only_flags_accepted = true;
}
}
if ( tests )
{
if ( !dict.empty( ) || !file.empty( ) )
{
warn_msg( "dict and file options are ingored in test mode\n" );
}
out_verbose = verbose;
out_tests = true;
return true;
}
else if ( !dict.empty( ) && !file.empty( ) )
{
out_file = std::move( file );
out_dict = std::move( dict );
out_verbose = verbose;
out_tests = false;
return true;
}
else
{
err_msg(
"-f and -d are mandatory command-line options in a non-test run. Please specify "
"them or request test mode with -t." );
return false;
}
}
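// Reads the whole file into out_text; returns false if the file cannot be
// opened or fully read.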
bool
load_text_from_file( const std::string& file_path, std::string& out_text )
{
// Binary mode keeps tellg( ) consistent with the number of bytes read( ) returns.
std::ifstream file_stream( file_path, std::ios::in | std::ios::binary );
if ( !file_stream )
{
return false;
}
file_stream.seekg( 0, file_stream.end );
size_t file_size = file_stream.tellg( );
file_stream.seekg( 0, file_stream.beg );
std::string text( file_size, '\0' );
file_stream.read( &text[ 0 ], file_size );
if ( file_stream )
{
out_text = text;
verbose_msg( "read text: %s", text.c_str( ) );
return true;
}
return false;
}
namespace
{
std::string
file_name_from_file_path( std::string fpath )
{
// Handle both Windows and POSIX path separators.
size_t pos = fpath.find_last_of( "\\/" );
if ( pos != std::string::npos )
{
return std::string( fpath, pos + 1, fpath.size( ) - pos );
}
return fpath;
}
}
void
show_usage( std::string self_name )
{
std::printf( "Phresal verb finder. (c) lsem, 2017\n" );
std::printf( "USAGE:\n" );
std::printf( "%3s%s OPTIONS\n\n", "", self_name.c_str( ) );
std::printf( "Available options:\n" );
const char* option_format_str = "%5s%-5s%-15s%-35s\n";
std::printf( option_format_str, "-f", "", "FILE PATH", "Path to input file." );
std::printf( option_format_str, "-d", "", "FILE PATH", "Path to dictionary file." );
std::printf( option_format_str, "-t", "", "",
"Flag makes the app run tests instead of running app." );
std::printf( option_format_str, "-v", "", "", "Flag enables verbose mode." );
}
} // namespace application
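// Example invocations (assuming the compiled binary is named `phresals_finder`;
// the actual name depends on how this file is built):
//   phresals_finder -t                          run the built-in tests
//   phresals_finder -f text.txt -d dict.txt -v  match phrasal verbs in text.txt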
int
main( int argc, char* argv[] )
{
std::string file_path, dict_file_path;
bool verbose = false, tests = false;
if ( !application::parse_command_line_options( argc, argv,
/*out*/ file_path,
/*out*/ dict_file_path,
/*out*/ verbose,
/*out*/ tests ) )
{
application::show_usage( application::file_name_from_file_path( argv[ 0 ] ) );
return 1;
}
g_verbose = verbose;
if ( tests )
{
run_tests( );
return 0;
}
else
{
verbose_msg( "effective file path `%s`", file_path.c_str( ) );
verbose_msg( "effective dict path: `%s`", dict_file_path.c_str( ) );
verbose_msg( "Loading text from file `%s`", file_path.c_str( ) );
std::string text;
if ( !application::load_text_from_file( file_path, text ) )
{
err_msg( "failed loading text from file" );
return 1;
}
auto match_cb = []( const phresal_matcher_lib::PhresalsMatcher::MatchInfo& info ) {
// Match positions are not tracked yet (see the TODO in PhresalsMatcher),
// so report the matched phrasal verb itself.
info_msg( "Found phrasal verb: %s %s", info.phresal.tokens[ 0 ].c_str( ),
info.phresal.tokens[ 1 ].c_str( ) );
};
phresal_matcher_lib::TextTokenizer tokenizer{text, WHITESPACES, DELIMITERS};
phresal_matcher_lib::Dictionary dict{phresal_matcher_lib::get_hardcoded_dictionary( )};
phresal_matcher_lib::PhresalsMatcher matcher{tokenizer, dict, match_cb};
matcher.match_all( );
return 0;
}
return 0;
}