Created
July 26, 2023 09:24
-
-
Save icecoobe/269688b71a2c3adc2c4183f87b6153b8 to your computer and use it in GitHub Desktop.
calculate similarity of 2 sentences
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm>
#include <cctype>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <unordered_set>
namespace
{
    /// Lower-cases every character of `str` in place using std::tolower.
    ///
    /// @param str String to modify.
    void to_lower(std::string& str)
    {
        // The character is taken as unsigned char because std::tolower is only
        // defined for values representable as unsigned char (or EOF).
        std::transform(str.begin(), str.end(), str.begin(),
                       [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    }

    /// Erases from `str`, in place, every character that std::ispunct
    /// classifies as punctuation.
    ///
    /// @param str String to modify.
    void remove_punctuation(std::string& str)
    {
        // Same unsigned char rule as above: passing a negative plain `char`
        // (e.g. a byte of a non-ASCII UTF-8 sequence) to std::ispunct is
        // undefined behaviour, so the predicate converts first.
        str.erase(std::remove_if(str.begin(), str.end(),
                                 [](unsigned char c) { return std::ispunct(c) != 0; }),
                  str.end());
    }
} // namespace
double sentence_similarity(const std::string& str1, const std::string& str2) | |
{ | |
auto s1 = str1; | |
auto s2 = str2; | |
to_lower(s1); | |
to_lower(s2); | |
remove_punctuation(s1); | |
remove_punctuation(s2); | |
std::istringstream iss1(s1); | |
std::istringstream iss2(s2); | |
std::unordered_set<std::string> words1(std::istream_iterator<std::string>{iss1}, | |
std::istream_iterator<std::string>{}); | |
std::unordered_set<std::string> words2(std::istream_iterator<std::string>{iss2}, | |
std::istream_iterator<std::string>{}); | |
std::unordered_set<std::string> intersection; | |
std::unordered_set<std::string> union_set; | |
for (const auto& word : words1) { | |
union_set.insert(word); | |
if (words2.count(word) > 0) { | |
intersection.insert(word); | |
} | |
} | |
for (const auto& word : words2) { | |
union_set.insert(word); | |
} | |
double similarity = static_cast<double>(intersection.size()) / static_cast<double>(union_set.size()); | |
return similarity; | |
} | |
int main() { | |
std::string s1 = "This is is a test sentence."; | |
std::string s2 = "this sentence is a test."; | |
double similarity = sentence_similarity(s1, s2); | |
std::cout << "Similarity: " << similarity << std::endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
original bug version:
`#include
#include
#include
#include
#include
using words_type = std::vector<std::string>;
namespace
{
inline void to_lower(std::string& str)
{
std::transform(str.begin(), str.end(),
str.begin(),
[](unsigned char c) { return std::tolower(c); });
}
inline void remove_punctuation(std::string& str)
{
str.erase(std::remove_if(str.begin(), str.end(), [](char c)
{ return std::ispunct(c); }), str.end());
}
words_type retrieve_words(const std::string& sentence)
{
std::vector<std::string> words;
std::istringstream iss(sentence);
std::string word;
}
words_type get_intersection(const words_type& s1, const words_type& s2)
{
words_type inter;
std::set_intersection(s1.begin(), s1.end(),
s2.begin(), s2.end(),
std::back_inserter(inter));
return inter;
}
words_type get_union(const words_type& s1, const words_type& s2)
{
words_type union_vector;
std::set_union(s1.begin(), s1.end(),
s2.begin(), s2.end(),
std::back_inserter(union_vector));
return union_vector;
}
} // namespace {}
float calc_similarity(const std::string& s1, const std::string& s2)
{
auto temp1 = s1;
auto temp2 = s2;
}
int main()
{
std::string str = "This is a sentence. It contains punctuation marks!";
std::string str2 = "his is a sentence. It contains punctuation marks!";
}`