Last active
January 3, 2018 10:16
-
-
Save lsem/2fb600bd67f557a0ff22f014410cf2d6 to your computer and use it in GitHub Desktop.
PhresalsFinder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm> | |
#include <cassert> | |
#include <fstream> | |
#include <functional> | |
#include <iostream> | |
#include <list> | |
#include <map> | |
#include <set> | |
#include <string> | |
#include <vector> | |
// Make all asserts runtime: drop NDEBUG (the standard macro name; `_NDEBUG`
// was a typo and never defined by any toolchain) and re-include <cassert>,
// which redefines the assert macro according to the *current* NDEBUG state.
// Undefining after the first include alone would have no effect.
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <cassert>
// Characters that separate tokens.  NOTE: the trailing "\0" is ineffective --
// the constant is consumed via std::string(const char*) in TextTokenizer,
// which stops at the first NUL, so the effective set is "\t \n".
static const char* const WHITESPACES = "\t \n\0";
// Punctuation marks; each is yielded by the tokenizer as its own one-char token.
static const char* const DELIMITERS = ",.;?!";
namespace
{
// Global verbosity flag; overwritten in main() from the -v command line option.
bool g_verbose = true;

// Logging helpers.  `##__VA_ARGS__` (a GCC/Clang/MSVC extension) swallows the
// trailing comma when a macro is invoked with a format string only; the plain
// `__VA_ARGS__` form was ill-formed for calls such as err_msg( "text" ).
#define verbose_msg( msg, ... )                             \
    do                                                      \
    {                                                       \
        if ( g_verbose )                                    \
            std::printf( "VRB: " msg "\n", ##__VA_ARGS__ ); \
    } while ( false )
#define err_msg( msg, ... ) std::fprintf( stderr, "ERR: " msg "\n", ##__VA_ARGS__ );
#define warn_msg( msg, ... ) std::printf( "WARN: " msg "\n", ##__VA_ARGS__ );
#define info_msg( msg, ... ) std::printf( "" msg "\n", ##__VA_ARGS__ );
}
// Asserts a test condition; on failure prints the condition text and line
// number and makes the enclosing test function `return false`.
// NOTE: no trailing semicolon after while(false) -- the previous trailing `;`
// defeated the do-while idiom and broke use inside if/else without braces.
#define test_assert( Condition )                                                             \
    do                                                                                       \
    {                                                                                        \
        if ( !( Condition ) )                                                                \
        {                                                                                    \
            std::printf( "test condition failed: `%s` at line %d\n", #Condition, __LINE__ ); \
            return false;                                                                    \
        }                                                                                    \
    } while ( false )
// Asserts Expected == Actual; on failure prints both expressions and values
// (via std::to_string) and makes the enclosing test function `return false`.
// NOTE: no trailing semicolon after while(false) -- the previous trailing `;`
// defeated the do-while idiom and broke use inside if/else without braces.
#define test_eq( Expected, Actual )                                                                \
    do                                                                                             \
    {                                                                                              \
        auto expected = Expected;                                                                  \
        auto actual = Actual;                                                                      \
        if ( expected != actual )                                                                  \
        {                                                                                          \
            std::printf( "test condition failed: Expected %s (%s) != Actual %s (%s) at line %d\n", \
                         #Expected, std::to_string( expected ).c_str( ), #Actual,                  \
                         std::to_string( actual ).c_str( ), __LINE__ );                            \
            return false;                                                                          \
        }                                                                                          \
    } while ( false )
#define DISABLED_TEST_CASES ; | |
namespace phresal_matcher_lib | |
{ | |
class TextTokenizer | |
{ | |
public: | |
enum class TokenType | |
{ | |
Regular, | |
Punctuation, | |
End | |
}; | |
struct Token | |
{ | |
TokenType type; | |
std::string data; | |
}; | |
public: | |
explicit TextTokenizer( const std::string& text, | |
std::string whitespaces_chars, | |
std::string punctuation_mark_chars ) | |
{ | |
// place \n as terminator which is delimiter to handle last token. | |
this->text = text + "\n"; | |
this->offset = 0; | |
this->whitespaces_chars = std::move( whitespaces_chars ); | |
this->punctuation_mark_chars = std::move( punctuation_mark_chars ); | |
} | |
void | |
skip_whitespaces( ) | |
{ | |
this->offset = this->text.find_first_not_of( this->whitespaces_chars, this->offset ); | |
} | |
void | |
skip_punctuation_marks( ) | |
{ | |
this->offset = this->text.find_first_not_of( this->punctuation_mark_chars, this->offset ); | |
} | |
Token | |
next_token( ) | |
{ | |
if ( !returned_tokens.empty( ) ) | |
{ | |
auto result = returned_tokens.back( ); | |
returned_tokens.pop_back( ); | |
return result; | |
} | |
// find next token start position | |
skip_whitespaces( ); | |
if ( !end( ) ) | |
{ | |
if ( this->punctuation_mark_chars.find( this->text[ this->offset ] ) | |
!= std::string::npos ) | |
{ | |
std::string tok; | |
tok += this->text[ this->offset ]; | |
this->offset += 1; | |
return {TokenType::Punctuation, tok}; | |
} | |
} | |
if ( !end( ) ) | |
{ | |
size_t token_end_pos | |
= this->text.find_first_of( TextTokenizer::separators( ), this->offset ); | |
if ( token_end_pos != std::string::npos ) | |
{ | |
std::string token( this->text, this->offset, token_end_pos - this->offset ); | |
this->offset = token_end_pos; | |
verbose_msg( "yield(%s)", token.c_str( ) ); | |
return {TokenType::Regular, token}; | |
} | |
} | |
return {TokenType::End, ""}; | |
} | |
void | |
put_token_back( Token token ) | |
{ | |
returned_tokens.push_back( std::move( token ) ); | |
} | |
bool | |
end( ) const | |
{ | |
return this->offset == this->text.size( ) || this->offset == std::string::npos; | |
} | |
const std::string& | |
separators( ) const | |
{ | |
static std::string separators_chars = whitespaces_chars + punctuation_mark_chars; | |
return separators_chars; | |
} | |
private: | |
std::list< Token > returned_tokens; | |
std::string whitespaces_chars, punctuation_mark_chars; | |
std::string text; | |
size_t offset; | |
}; | |
bool | |
tokenizer_basic_test( ) | |
{ | |
TextTokenizer tokenizer( " Quick brown fox, jumps over the lazy dog!", WHITESPACES, | |
DELIMITERS ); | |
test_assert( tokenizer.end( ) == false ); | |
{ | |
auto t = tokenizer.next_token( ); | |
test_assert( t.data == "Quick" ); | |
test_assert( t.type == TextTokenizer::TokenType::Regular ); | |
test_assert( !tokenizer.end( ) ); | |
} | |
test_assert( tokenizer.next_token( ).data == "brown" ); | |
test_assert( !tokenizer.end( ) ); | |
test_assert( tokenizer.next_token( ).data == "fox" ); | |
test_assert( !tokenizer.end( ) ); | |
{ | |
auto t = tokenizer.next_token( ); | |
test_assert( t.data == "," ); | |
test_assert( t.type == TextTokenizer::TokenType::Punctuation ); | |
test_assert( !tokenizer.end( ) ); | |
} | |
test_assert( tokenizer.next_token( ).data == "jumps" ); | |
test_assert( !tokenizer.end( ) ); | |
test_assert( tokenizer.next_token( ).data == "over" ); | |
test_assert( !tokenizer.end( ) ); | |
test_assert( tokenizer.next_token( ).data == "the" ); | |
test_assert( !tokenizer.end( ) ); | |
tokenizer.put_token_back( {TextTokenizer::TokenType::Regular, "the"} ); | |
tokenizer.put_token_back( {TextTokenizer::TokenType::Regular, "over"} ); | |
test_assert( tokenizer.next_token( ).data == "over" ); | |
test_assert( !tokenizer.end( ) ); | |
test_assert( tokenizer.next_token( ).data == "the" ); | |
test_assert( !tokenizer.end( ) ); | |
test_assert( tokenizer.next_token( ).data == "lazy" ); | |
test_assert( !tokenizer.end( ) ); | |
test_assert( tokenizer.next_token( ).data == "dog" ); | |
test_assert( !tokenizer.end( ) ); | |
test_assert( tokenizer.next_token( ).data == "!" ); | |
test_assert( !tokenizer.end( ) ); | |
test_assert( tokenizer.next_token( ).data == "" ); | |
test_assert( tokenizer.end( ) ); | |
return true; | |
}; | |
// Kind of a dictionary entry.  NOTE: the enumerator order matters -- the
// Dictionary sorts and searches entries by (type, tokens), so all `verb`
// entries precede all `phresal_verb` entries.
enum class DictionaryEntryType
{
    verb,         // single-word regular verb, e.g. {"walk"}
    phresal_verb, // two-word phrasal verb, e.g. {"get", "up"}
    invalid,
    any = invalid // alias: `any` and `invalid` share a value
};
// One dictionary record: a regular verb (one token) or a phrasal verb
// (exactly two tokens -- PhresalsMatcher::match_all asserts this).
struct DictionaryEntry
{
    DictionaryEntryType type;
    std::vector< std::string > tokens;
};
class Dictionary | |
{ | |
public: | |
using const_iterator = std::vector< DictionaryEntry >::const_iterator; | |
using const_iterators_range = std::pair< const_iterator, const_iterator >; | |
public: | |
explicit Dictionary( std::vector< DictionaryEntry > data ) | |
{ | |
this->data = std::move( data ); | |
std::sort( this->data.begin( ), this->data.end( ), compare_tokens_less ); | |
} | |
const_iterator | |
end( ) const | |
{ | |
return this->data.end( ); | |
} | |
private: | |
static bool | |
compare_tokens_less( const DictionaryEntry& lhs, const DictionaryEntry& rhs ) | |
{ | |
return ( lhs.type < rhs.type ) || ( lhs.type == rhs.type && lhs.tokens < rhs.tokens ); | |
} | |
static bool | |
compare_by_first_token( const DictionaryEntry& lhs, const DictionaryEntry& rhs ) | |
{ | |
if ( lhs.type < rhs.type ) | |
{ | |
return true; | |
} | |
else if ( lhs.type == rhs.type ) | |
{ | |
if ( lhs.tokens.empty( ) || rhs.tokens.empty( ) ) | |
{ | |
return lhs.tokens.empty( ) < rhs.tokens.empty( ); | |
} | |
return lhs.tokens[ 0 ] < rhs.tokens[ 0 ]; | |
} | |
else | |
{ | |
// lhs.type > rhs.type | |
return false; | |
} | |
} | |
public: | |
const_iterator | |
look_up_phresal( const std::vector< std::string >& tokens ) const | |
{ | |
if ( tokens.empty( ) ) | |
return end( ); | |
std::string tokens_str; | |
if ( g_verbose ) | |
{ | |
for ( auto t : tokens ) | |
tokens_str += t + ", "; | |
tokens_str.resize( tokens_str.size( ) - 2 ); | |
} | |
DictionaryEntry search_value{DictionaryEntryType::phresal_verb, tokens}; | |
auto lb_it | |
= std::lower_bound( data.begin( ), data.end( ), search_value, compare_tokens_less ); | |
if ( lb_it != data.end( ) && !compare_tokens_less( search_value, *lb_it ) ) | |
{ | |
if ( g_verbose ) | |
{ | |
verbose_msg( "look_up() succeeded: %s", tokens_str.c_str( ) ); | |
} | |
return lb_it; | |
} | |
if ( g_verbose ) | |
{ | |
verbose_msg( "look_up() failed: %s", tokens_str.c_str( ) ); | |
} | |
return end( ); | |
} | |
std::pair< const_iterator, const_iterator > | |
lookup_by_first_word( const std::string& word ) const | |
{ | |
DictionaryEntry search_value{DictionaryEntryType::phresal_verb, {word}}; | |
auto lb_it | |
= std::lower_bound( data.begin( ), data.end( ), search_value, compare_by_first_token ); | |
if ( lb_it != data.end( ) && lb_it->type == DictionaryEntryType::phresal_verb | |
&& !lb_it->tokens.empty( ) && lb_it->tokens[ 0 ] == word ) | |
{ | |
auto ub_it = std::upper_bound( data.begin( ), data.end( ), search_value, | |
compare_by_first_token ); | |
return {lb_it, ub_it}; | |
} | |
else | |
{ | |
return {end( ), end( )}; | |
} | |
} | |
const_iterator | |
lookup_for_verb( const std::string& word ) const | |
{ | |
DictionaryEntry search_value{DictionaryEntryType::verb, {word}}; | |
auto lb_it | |
= std::lower_bound( data.begin( ), data.end( ), search_value, compare_tokens_less ); | |
if ( lb_it != data.end( ) && !compare_tokens_less( search_value, *lb_it ) ) | |
{ | |
return lb_it; | |
} | |
return end( ); | |
} | |
private: | |
std::vector< DictionaryEntry > data; | |
}; | |
std::vector< DictionaryEntry > | |
get_hardcoded_dictionary( ) | |
{ | |
static const std::vector< DictionaryEntry > result = { | |
{DictionaryEntryType::phresal_verb, {"get", "on"}}, | |
{DictionaryEntryType::phresal_verb, {"get", "off"}}, | |
{DictionaryEntryType::phresal_verb, {"try", "on"}}, | |
{DictionaryEntryType::phresal_verb, {"take", "off"}}, | |
{DictionaryEntryType::phresal_verb, {"look", "up"}}, | |
{DictionaryEntryType::phresal_verb, {"get", "up"}}, | |
{DictionaryEntryType::phresal_verb, {"get", "back"}}, | |
{DictionaryEntryType::phresal_verb, {"pick", "up"}}, | |
}; | |
return result; | |
} | |
class PhresalsMatcher | |
{ | |
public: | |
struct MatchInfo | |
{ | |
DictionaryEntry phresal; | |
size_t column, row, offset; | |
}; | |
using ReportResultCb = std::function< void( const MatchInfo& match_info ) >; | |
PhresalsMatcher( TextTokenizer tokenizer, const Dictionary& dictionary, ReportResultCb cb ) | |
: tokenizer( tokenizer ) | |
, dictionary( dictionary ) | |
, cb( cb ) | |
{ | |
} | |
// TODO: 1) Yield position, row and column. | |
// 2) check if phresal is split by regular verb e.g `get xxx acquire by' should not match | |
// `get by` here because in between we have verb. 3) the same as (2) but check for | |
// punctuation marks. | |
void | |
match_all( ) | |
{ | |
std::string first_phresal_token, second_phresal_token; | |
bool in_phresal = false; | |
Dictionary::const_iterators_range lookup_range_result{dictionary.end( ), dictionary.end( )}; | |
while ( !tokenizer.end( ) ) | |
{ | |
auto token = tokenizer.next_token( ); | |
Dictionary::const_iterator lookup_result = dictionary.end( ); | |
if ( !in_phresal ) | |
{ | |
verbose_msg( "lookup verb '%s'", token.data.c_str( ) ); | |
lookup_range_result = dictionary.lookup_by_first_word( token.data ); | |
if ( lookup_range_result.first != dictionary.end( ) ) | |
{ | |
unsigned i = 0; | |
for ( auto it = lookup_range_result.first; it != lookup_range_result.second; | |
++it ) | |
{ | |
verbose_msg( "%d: lookup verb '%s' SUCCESS: [%s:%s]", i++, | |
token.data.c_str( ), it->tokens[ 0 ].c_str( ), | |
it->tokens[ 1 ].c_str( ) ); | |
} | |
// first word has matched to one or several phresals | |
in_phresal = true; | |
} | |
else | |
{ | |
verbose_msg( "status == notinphresal, lookup verb '%s' FAILED", | |
token.data.c_str( ) ); | |
} | |
} | |
else | |
{ | |
// in phresal | |
// check if current tokens breaks current phresal match. in cat be due to regular | |
// verb, or due to punctuation mark (simplified heuristics). | |
if ( token.type == TextTokenizer::TokenType::Punctuation ) | |
{ | |
verbose_msg( "in phresal false due to breaking punctuation mark: [%s]!", | |
token.data.c_str( ) ); | |
// current match broken | |
in_phresal = false; | |
} | |
else if ( ( lookup_result = dictionary.lookup_for_verb( token.data ) ) | |
!= dictionary.end( ) ) | |
{ | |
assert( lookup_result->type == DictionaryEntryType::verb ); | |
// current match broken | |
in_phresal = false; | |
verbose_msg( "in phresal false due to breaking VERB %s!", | |
lookup_result->tokens[ 0 ].c_str( ) ); | |
// but this verb can be beginning of other phresal, will be handled in next | |
// iteration. | |
tokenizer.put_token_back( std::move( token ) ); | |
} | |
else | |
{ | |
verbose_msg( | |
"in phresal, trying matching second word! current token is [%s], there are " | |
"%d candidatees", | |
token.data.c_str( ), | |
(int)std::distance( lookup_range_result.first, lookup_range_result.second ) ); | |
// no, this word does not break our current match, lets try to match second | |
// token of the phresal. | |
for ( auto it = lookup_range_result.first; it != lookup_range_result.second; | |
++it ) | |
{ | |
assert( it->type == DictionaryEntryType::phresal_verb ); | |
assert( it->tokens.size( ) == 2 ); | |
verbose_msg( "testing token: %s", it->tokens[ 1 ].c_str( ) ); | |
if ( token.data == it->tokens[ 1 ] ) | |
{ | |
verbose_msg( "matched second word of phresal" ); | |
cb( MatchInfo{DictionaryEntry{DictionaryEntryType::phresal_verb, | |
{it->tokens[ 0 ], it->tokens[ 1 ]}}, | |
0, 0, 0} ); | |
in_phresal = false; | |
break; | |
} | |
} | |
} | |
} | |
} | |
} | |
private: | |
TextTokenizer tokenizer; | |
const Dictionary& dictionary; | |
ReportResultCb cb; | |
}; | |
} // namespace phresal_matcher_lib | |
void | |
run_tests( ) | |
{ | |
using namespace phresal_matcher_lib; | |
auto run_all = []( std::vector< std::pair< const char*, std::function< bool( ) > > > tests ) { | |
std::vector< std::string > passed, failed; | |
for ( auto test : tests ) | |
{ | |
info_msg( "running test case: %s", test.first ); | |
if ( test.second( ) ) | |
{ | |
info_msg( "%s passed.\n", test.first ); | |
passed.emplace_back( std::string( test.first ) ); | |
} | |
else | |
{ | |
failed.emplace_back( std::string( test.first ) ); | |
} | |
} | |
info_msg( "*** Summary ***\n" ); | |
info_msg( "Passed tests:" ); | |
for ( auto t : passed ) | |
{ | |
std::printf( "\t%s\n", t.c_str( ) ); | |
} | |
if ( !failed.empty( ) ) | |
{ | |
info_msg( "Failed tests:" ); | |
for ( auto t : failed ) | |
{ | |
std::printf( "\t%s\n", t.c_str( ) ); | |
} | |
} | |
else | |
{ | |
info_msg( "No failed tests.\n" ); | |
} | |
}; | |
auto dictionary_basic_test = []( ) -> bool { | |
Dictionary dict{get_hardcoded_dictionary( )}; | |
test_assert( dict.look_up_phresal( {"look", "up"} ) != dict.end( ) ); | |
test_assert( dict.look_up_phresal( {"look", "up"} )->type | |
== DictionaryEntryType::phresal_verb ); | |
test_assert( dict.look_up_phresal( {"fuck", "off"} ) == dict.end( ) ); | |
// TODO: test case when queried by first token phresal has multiple choices. | |
// TODO: test lookup_verb | |
return true; | |
}; | |
auto matcher_basic_test = []( ) -> bool { | |
std::vector< std::string > matched_phresals; | |
Dictionary dict{{ | |
{DictionaryEntryType::phresal_verb, {"get", "up"}}, | |
{DictionaryEntryType::phresal_verb, {"get", "down"}}, | |
{DictionaryEntryType::phresal_verb, {"try", "on"}}, | |
{DictionaryEntryType::phresal_verb, {"set", "off"}}, | |
{DictionaryEntryType::verb, {"walk"}}, // Used as stopper for set off | |
{DictionaryEntryType::phresal_verb, {"make", "up"}}, | |
{DictionaryEntryType::phresal_verb, {"make", "off"}}, | |
// Without regular verbs that appear as first words of phresal verbs, nothing will work | |
// as expected | |
{DictionaryEntryType::verb, {"walk"}}, // Used as stopper for set off | |
{DictionaryEntryType::verb, {"get"}}, | |
}}; | |
std::string text | |
// 1) Regular `get up` match | |
= "I get up in the monning.\n" | |
// 2) .. try ... on ... pattern | |
"I always try new things on.\n" | |
// 3) Set off is not matched because of `walk` which is regular word | |
"I set these walk off\n" | |
// 4) After previous test case, check state is recovered and we still can match cases | |
// like (2). | |
"I also make my bed up every day" | |
"\n" | |
// 5) .. get XXX get up. Must match latest `get up` (todo: check offset). | |
"I get XXX get up" | |
"\n" | |
// 6) .. get XXX get YYY up. Must match latest `get up` like in case (2). | |
"I get XXX get things up" | |
"\n" | |
// 7) Phresal verb broken by punctuation mark does not count. | |
"I get . up" | |
"\n" | |
// 8) Phresal verb broken by punctuation mark does not count. | |
"I get ; up" | |
"\n" | |
// 9) Phresal verb broken by punctuation mark does not count. | |
"I get, up" | |
"\n" | |
// 10) Phresal verb broken by punctuation mark does not count. | |
"I get ! up" | |
"\n"; | |
TextTokenizer tokenizer( text, WHITESPACES, DELIMITERS ); | |
PhresalsMatcher matcher{tokenizer, dict, | |
[&matched_phresals]( const PhresalsMatcher::MatchInfo& info ) { | |
verbose_msg( "match" ); | |
matched_phresals.push_back( info.phresal.tokens[ 0 ] + " " | |
+ info.phresal.tokens[ 1 ] ); | |
}}; | |
matcher.match_all( ); | |
test_eq( 5, matched_phresals.size( ) ); | |
verbose_msg( "\nmatched phresals:" ); | |
for ( auto phresal : matched_phresals ) | |
{ | |
verbose_msg( "%s", phresal.c_str( ) ); | |
} | |
test_assert( std::find( matched_phresals.begin( ), matched_phresals.end( ), "get up" ) | |
!= matched_phresals.end( ) ); | |
test_assert( std::find( matched_phresals.begin( ), matched_phresals.end( ), "try on" ) | |
!= matched_phresals.end( ) ); | |
test_assert( std::find( matched_phresals.begin( ), matched_phresals.end( ), "set off" ) | |
== matched_phresals.end( ) ); | |
test_assert( std::find( matched_phresals.begin( ), matched_phresals.end( ), "make up" ) | |
!= matched_phresals.end( ) ); | |
test_assert( std::find( matched_phresals.begin( ), matched_phresals.end( ), "get up" ) | |
!= matched_phresals.end( ) ); | |
return true; | |
}; | |
run_all( { | |
{"tokenizer_basic_test", tokenizer_basic_test}, | |
{"dictionary_basic_test", dictionary_basic_test}, | |
{"matcher_basic_test", matcher_basic_test}, | |
// { "file_loader_test", file_loader_test }, | |
// { "integration_file_tokenizing_test", integration_file_tokenizing_test } | |
} ); | |
} | |
namespace application | |
{ | |
bool | |
parse_command_line_options( int argc, | |
char* argv[], | |
std::string& out_file, | |
std::string& out_dict, | |
bool& out_verbose, | |
bool& out_tests ) | |
{ | |
if ( argc < 2 ) | |
{ | |
err_msg( "In non test mode at least two options needed: input file and dictionary.\n" ); | |
return false; | |
} | |
if ( argc == 2 ) | |
{ | |
if ( std::string( argv[ 1 ] ) == "-t" ) | |
{ | |
out_tests = true; | |
out_verbose = false; | |
return true; | |
} | |
else | |
{ | |
err_msg( "At least two options needed: input file and dictionary.\n" ); | |
return false; | |
} | |
} | |
bool only_flags_accepted = false; | |
std::string file, dict; | |
bool verbose = false, tests = false; | |
for ( size_t argn = 1U; argn < argc; ++argn ) | |
{ | |
if ( !only_flags_accepted && std::string( argv[ argn ] ) == "-f" ) | |
{ | |
if ( argn + 1 < argc ) | |
{ | |
file = argv[ argn + 1 ]; | |
} | |
else | |
{ | |
err_msg( "-f should follow file path.\n" ); | |
return false; | |
} | |
} | |
else if ( !only_flags_accepted && std::string( argv[ argn ] ) == "-d" ) | |
{ | |
if ( argn + 1 < argc ) | |
{ | |
dict = argv[ argn + 1 ]; | |
} | |
else | |
{ | |
err_msg( "-d should follow dictionary file path.\n" ); | |
return false; | |
} | |
} | |
else if ( std::string( argv[ argn ] ) == "-v" ) | |
{ | |
verbose = true; | |
} | |
else if ( std::string( argv[ argn ] ) == "-t" ) | |
{ | |
tests = true; | |
} | |
if ( !dict.empty( ) && !file.empty( ) ) | |
{ | |
only_flags_accepted = true; | |
} | |
} | |
if ( tests ) | |
{ | |
if ( !dict.empty( ) || !file.empty( ) ) | |
{ | |
warn_msg( "dict and file options are ingored in test mode\n" ); | |
} | |
out_verbose = verbose; | |
out_tests = true; | |
return true; | |
} | |
else if ( !dict.empty( ) && !file.empty( ) ) | |
{ | |
out_file = std::move( file ); | |
out_dict = std::move( dict ); | |
out_verbose = verbose; | |
out_tests = false; | |
return true; | |
} | |
else | |
{ | |
err_msg( | |
"-f and -d are mandatory command line options in non-test run. Please specify them or " | |
"specify test mode\n" ); | |
return false; | |
} | |
} | |
bool | |
load_text_from_file( const std::string& file_path, std::string& out_text ) | |
{ | |
std::ifstream file_stream( file_path, std::ios::in | std::ios::in ); | |
if ( !file_stream ) | |
{ | |
return false; | |
} | |
file_stream.seekg( 0, file_stream.end ); | |
size_t file_size = file_stream.tellg( ); | |
file_stream.seekg( 0, file_stream.beg ); | |
std::string text( file_size, '\0' ); | |
file_stream.read( const_cast< char* >( text.data( ) ), file_size ); | |
if ( file_stream ) | |
{ | |
out_text = text; | |
verbose_msg( "read text: %s", text.c_str( ) ); | |
return true; | |
} | |
return false; | |
} | |
namespace | |
{ | |
// Returns the file-name component of a path, i.e. everything after the last
// path separator.  Accepts both Windows ('\\') and POSIX ('/') separators;
// the original handled only backslashes, so POSIX paths came back whole.
std::string
file_name_from_file_path( std::string fpath )
{
    size_t pos = fpath.find_last_of( "/\\" );
    if ( pos != std::string::npos )
    {
        return fpath.substr( pos + 1 );
    }
    return fpath;
}
} | |
// Prints the program banner and a table of supported command line options.
void
show_usage( std::string self_name )
{
    std::printf( "Phresal verb finder. (c) lsem, 2017\n" );
    std::printf( "USAGE:\n" );
    std::printf( "%3s%s OPTIONS\n\n", "", self_name.c_str( ) );
    std::printf( "Available options:\n" );
    // One row per option: flag, argument placeholder, description.
    struct OptionHelp
    {
        const char* flag;
        const char* arg;
        const char* description;
    };
    const OptionHelp rows[] = {
        {"-f", "FILE PATH", "Path to input file."},
        {"-d", "FILE PATH", "Path to dictionary file."},
        {"-t", "", "Flag makes the app run tests instead of running app."},
        {"-v", "", "Flag enables verbose mode."},
    };
    const char* option_format_str = "%5s%-5s%-15s%-35s\n";
    for ( const auto& row : rows )
    {
        std::printf( option_format_str, row.flag, "", row.arg, row.description );
    }
}
} | |
int | |
main( int argc, char* argv[] ) | |
{ | |
std::string file_path, dict_file_path; | |
bool verbose, tests; | |
if ( !application::parse_command_line_options( argc, argv, | |
/*out*/ file_path, | |
/*out*/ dict_file_path, | |
/*out*/ verbose, | |
/*out*/ tests ) ) | |
{ | |
application::show_usage( application::file_name_from_file_path( argv[ 0 ] ) ); | |
return 1; | |
} | |
g_verbose = verbose; | |
if ( tests ) | |
{ | |
run_tests( ); | |
return 0; | |
} | |
else | |
{ | |
verbose_msg( "effective file path `%s`", file_path.c_str( ) ); | |
verbose_msg( "effective dict path: `%s`", dict_file_path.c_str( ) ); | |
verbose_msg( "Loading text from file `%s`", file_path.c_str( ) ); | |
std::string text; | |
if ( !application::load_text_from_file( file_path, text ) ) | |
{ | |
err_msg( "failed loading text from file" ); | |
return 1; | |
} | |
auto match_cb = []( const phresal_matcher_lib::PhresalsMatcher::MatchInfo& /*info*/ ) { | |
info_msg( "Found phresal verb at position %d", -1 ); | |
}; | |
phresal_matcher_lib::TextTokenizer tokenizer{text, WHITESPACES, DELIMITERS}; | |
phresal_matcher_lib::Dictionary dict{phresal_matcher_lib::get_hardcoded_dictionary( )}; | |
phresal_matcher_lib::PhresalsMatcher matcher{tokenizer, dict, match_cb}; | |
matcher.match_all( ); | |
return 0; | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment