Skip to content

Instantly share code, notes, and snippets.

@icecoobe
Created July 26, 2023 09:24
Show Gist options
  • Save icecoobe/269688b71a2c3adc2c4183f87b6153b8 to your computer and use it in GitHub Desktop.
Save icecoobe/269688b71a2c3adc2c4183f87b6153b8 to your computer and use it in GitHub Desktop.
calculate similarity of 2 sentences
#include <algorithm>
#include <cctype>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <unordered_set>
namespace
{
/// Converts every character of `str` to lower case, in place.
///
/// Case mapping is done byte-by-byte with std::tolower, so it follows the
/// current C locale and does not understand multi-byte encodings.
void to_lower(std::string& str)
{
    // The lambda takes `unsigned char` because the <cctype> functions have
    // undefined behaviour for negative arguments other than EOF.
    std::transform(str.begin(), str.end(), str.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
}

/// Erases every punctuation character (as classified by std::ispunct) from
/// `str`, in place. All other characters keep their relative order.
void remove_punctuation(std::string& str)
{
    // `unsigned char` parameter: passing a negative `char` (e.g. a byte of a
    // UTF-8 sequence on platforms where char is signed) to std::ispunct is
    // undefined behaviour.
    str.erase(std::remove_if(str.begin(), str.end(),
                             [](unsigned char c) { return std::ispunct(c) != 0; }),
              str.end());
}
} // namespace
/// Computes the Jaccard similarity of two sentences, treating each sentence
/// as a set of distinct words.
///
/// Both sentences are lower-cased, stripped of punctuation and split on
/// whitespace; duplicate words within a sentence count once.
///
/// @param str1 First sentence.
/// @param str2 Second sentence.
/// @return |words1 ∩ words2| / |words1 ∪ words2| in the range [0, 1].
///         Returns 0.0 when neither sentence contains any word (the ratio
///         would otherwise be 0/0, i.e. NaN).
double sentence_similarity(const std::string& str1, const std::string& str2)
{
    // Normalizes a private copy of the sentence and collects its distinct words.
    const auto distinct_words = [](std::string text) {
        to_lower(text);
        remove_punctuation(text);
        std::istringstream stream(text);
        return std::unordered_set<std::string>(std::istream_iterator<std::string>{stream},
                                               std::istream_iterator<std::string>{});
    };

    const auto words1 = distinct_words(str1);
    const auto words2 = distinct_words(str2);

    const auto shared_count = static_cast<std::size_t>(
        std::count_if(words1.begin(), words1.end(),
                      [&words2](const std::string& word) { return words2.count(word) > 0; }));

    // Inclusion-exclusion: |A ∪ B| = |A| + |B| - |A ∩ B|, so the union does
    // not need to be materialized.
    const std::size_t union_count = words1.size() + words2.size() - shared_count;
    if (union_count == 0) {
        return 0.0;
    }

    return static_cast<double>(shared_count) / static_cast<double>(union_count);
}
/// Demo entry point: prints the similarity of two hard-coded sentences that
/// differ only in case, word order, punctuation and one repeated word.
int main() {
    const std::string first{"This is is a test sentence."};
    const std::string second{"this sentence is a test."};

    const double score = sentence_similarity(first, second);
    std::cout << "Similarity: " << score << std::endl;

    return 0;
}
@icecoobe
Copy link
Author

original bug version:
`#include <algorithm>
#include <cctype>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>

/// A list of words; the set helpers below return it sorted and duplicate-free.
using words_type = std::vector<std::string>;

namespace
{
/// Converts every character of `str` to lower case, in place (byte-wise,
/// using std::tolower).
inline void to_lower(std::string& str)
{
    // `unsigned char` parameter: <cctype> functions are undefined for
    // negative arguments other than EOF.
    std::transform(str.begin(), str.end(), str.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
}

/// Erases every character classified as punctuation by std::ispunct from
/// `str`, in place.
inline void remove_punctuation(std::string& str)
{
    // `unsigned char` parameter for the same reason as in to_lower().
    str.erase(std::remove_if(str.begin(), str.end(),
                             [](unsigned char c) { return std::ispunct(c) != 0; }),
              str.end());
}

/// Splits `sentence` on whitespace and returns the words in their original
/// order (duplicates are kept).
words_type retrieve_words(const std::string& sentence)
{
    words_type words;
    std::istringstream iss(sentence);
    std::string word;

    while (iss >> word)
    {
        words.push_back(word);
    }

    return words;
}

/// Returns a sorted copy of `words` with duplicates removed, i.e. a form the
/// std::set_* algorithms can consume as a mathematical set.
words_type sorted_unique(words_type words)
{
    std::sort(words.begin(), words.end());
    words.erase(std::unique(words.begin(), words.end()), words.end());
    return words;
}

/// Returns the words present in both `s1` and `s2`, sorted and without
/// duplicates. The inputs may be in any order.
///
/// The original version passed the inputs straight to std::set_intersection,
/// which requires sorted ranges; unsorted input silently produced wrong
/// results.
words_type get_intersection(const words_type& s1, const words_type& s2)
{
    const auto lhs = sorted_unique(s1);
    const auto rhs = sorted_unique(s2);

    words_type inter;
    std::set_intersection(lhs.begin(), lhs.end(),
                          rhs.begin(), rhs.end(),
                          std::back_inserter(inter));
    return inter;
}

/// Returns the words present in `s1`, `s2` or both, sorted and without
/// duplicates. The inputs may be in any order.
///
/// std::set_union, like std::set_intersection, requires sorted ranges, so the
/// inputs are normalized first.
words_type get_union(const words_type& s1, const words_type& s2)
{
    const auto lhs = sorted_unique(s1);
    const auto rhs = sorted_unique(s2);

    words_type union_vector;
    std::set_union(lhs.begin(), lhs.end(),
                   rhs.begin(), rhs.end(),
                   std::back_inserter(union_vector));
    return union_vector;
}
} // namespace

float calc_similarity(const std::string& s1, const std::string& s2)
{
auto temp1 = s1;
auto temp2 = s2;

// normalize
to_lower(temp1);
to_lower(temp2);
remove_punctuation(temp1);
remove_punctuation(temp2);

auto v1 = retrieve_words(temp1);
for (const auto& w : v1) {
    std::cout << w << std::endl;
}
auto v2 = retrieve_words(temp2);
for (const auto& w : v2) {
    std::cout << w << std::endl;
}
auto intersection = get_intersection(v1, v2);
auto union_vector = get_union(v1, v2);

std::cout << "==== intersection ====" << std::endl;
for (const auto& str : intersection) {
    std::cout << str << " ";
}
std::cout << std::endl;

std::cout << "==== union ====" << std::endl;
for (const auto& str : union_vector) {
    std::cout << str << " ";
}
std::cout << std::endl;

if (union_vector.size() != 0)
{
    return intersection.size() / (float)union_vector.size();
}

return 0.0f;

}

int main()
{
std::string str = "This is a sentence. It contains punctuation marks!";
std::string str2 = "his is a sentence. It contains punctuation marks!";

// to_lower(str);
// remove_punctuation(str);

// std::cout << str << std::endl;

// std::string sentence = "This is a sentence.";
// remove_punctuation(sentence);
// auto words = retrieve_words(sentence);

// for (const auto& w : words) {
//     std::cout << w << std::endl;
// }

words_type v1 = {"hello", "world", "foo", "bar" };
words_type v2 = {"world", "bar", "baz", "hello" };

auto intersection = get_intersection(v1, v2);
std::cout << "==== intersection ====" << std::endl;
for (const auto& str : intersection) {
    std::cout << str << " ";
}
std::cout << std::endl;

// auto union_vector = get_union(v1, v2);

// for (const auto& str : union_vector) {
//     std::cout << str << " ";
// }
// std::cout << std::endl;

auto sim = calc_similarity(str, str2);
std::cout << sim << std::endl;

return 0;

}`

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment