Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Kronuz/91be4c94b538eabb7f09ee4c0defb363 to your computer and use it in GitHub Desktop.
Save Kronuz/91be4c94b538eabb7f09ee4c0defb363 to your computer and use it in GitHub Desktop.
Xapian core Document::Internal buffered terms map
diff --git a/src/xapian/backends/documentinternal.cc b/src/xapian/backends/documentinternal.cc
index 7c87292fc..a20fda9a4 100644
--- a/src/xapian/backends/documentinternal.cc
+++ b/src/xapian/backends/documentinternal.cc
@@ -38,13 +38,16 @@ namespace Xapian {
void
Document::Internal::ensure_terms_fetched() const
{
- if (terms)
+ if (terms || terms_buffer)
return;
+ if (!database.get()) {
+ terms_buffer.reset(new std::deque<terms_buffer_value_type>);
+ return;
+ }
+
terms.reset(new terms_type());
termlist_size = 0;
- if (!database.get())
- return;
unique_ptr<TermList> t(database->open_term_list(did));
while (t->next(), !t->at_end()) {
@@ -72,6 +75,38 @@ Document::Internal::ensure_values_fetched() const
}
}
+/** Replay the buffered term additions into a freshly created @a terms map.
+ *
+ *  Does nothing when there is no buffer.  Otherwise @a terms is replaced by a
+ *  new map, @a termlist_size is recomputed and the buffer is discarded.
+ */
+void
+Document::Internal::apply_terms_buffer() const
+{
+    if (!terms_buffer)
+        return;
+
+    terms.reset(new terms_type());
+    termlist_size = 0;
+
+    // Detach the buffer first so it is already gone while entries are
+    // replayed, then sort it in place: std::deque has random-access
+    // iterators, so no temporary copy of every entry is needed.
+    decltype(terms_buffer) pending(terms_buffer.release());
+    // Stable sort keeps entries for the same term in the order they were
+    // buffered, so the replay does not depend on how ties are broken.
+    std::stable_sort(pending->begin(), pending->end(),
+                     [](const auto& a, const auto& b) {
+                         return a.term < b.term;
+                     });
+
+    for (const auto& t : *pending) {
+        auto i = terms->find(t.term);
+        if (i == terms->end()) {
+            // Entries arrive in ascending term order, so a term that is not
+            // in the map yet sorts after every existing key: end() is the
+            // right insertion hint.
+            ++termlist_size;
+            terms->emplace_hint(terms->end(),
+                                t.term,
+                                t.has_term_pos ? TermInfo(t.wdf_inc, t.term_pos) : TermInfo(t.wdf_inc));
+        } else if (t.has_term_pos) {
+            if (i->second.add_position(t.wdf_inc, t.term_pos))
+                ++termlist_size;
+        } else {
+            if (i->second.increase_wdf(t.wdf_inc))
+                ++termlist_size;
+        }
+    }
+}
+
string
Document::Internal::fetch_data() const
{
@@ -105,6 +140,8 @@ Document::Internal::set_database(const Database& db) const
TermList*
Document::Internal::open_term_list() const
{
+ apply_terms_buffer();
+
if (terms)
return new DocumentTermList(this);
@@ -139,6 +176,8 @@ Document::Internal::get_description() const
description_append(desc, *data);
}
+ apply_terms_buffer();
+
if (terms) {
desc += ", terms[";
desc += str(terms->size());
diff --git a/src/xapian/backends/documentinternal.h b/src/xapian/backends/documentinternal.h
index a1aaab178..3ba3fa636 100644
--- a/src/xapian/backends/documentinternal.h
+++ b/src/xapian/backends/documentinternal.h
@@ -32,6 +32,7 @@
#include "xapian/backends/databaseinternal.h"
#include "xapian/common/overflow.h"
+#include <deque>
#include <map>
#include <memory>
#include <string>
@@ -76,6 +77,14 @@ class Document::Internal : public Xapian::Internal::intrusive_base {
using terms_type = std::map<std::string, TermInfo, std::less<std::string>, allocators::memory_pool_allocator<std::pair<const std::string, TermInfo>>>;
mutable std::unique_ptr<terms_type> terms;
+ /// One deferred term addition, queued in @a terms_buffer instead of being
+ /// applied to @a terms straight away.
+ ///
+ /// Field order matters: entries are created by positional brace
+ /// initialisation ({term, wdf_inc, has_term_pos, term_pos}).
+ struct terms_buffer_value_type {
+ /// Term the entry refers to.
+ std::string term;
+ /// Amount passed on as the wdf increment when the entry is applied.
+ Xapian::termcount wdf_inc;
+ /// True when @a term_pos holds a position to record for the term.
+ bool has_term_pos;
+ /// Position to record; entries without a position store 0 here.
+ Xapian::termpos term_pos;
+ };
+ /// Pending term additions.  Created by ensure_terms_fetched() when
+ /// database.get() is null; released by apply_terms_buffer() and
+ /// clear_terms().
+ mutable std::unique_ptr<std::deque<terms_buffer_value_type>> terms_buffer;
+
/** The number of distinct terms in @a terms.
*
* Only valid when terms is non-NULL.
@@ -110,6 +119,8 @@ class Document::Internal : public Xapian::Internal::intrusive_base {
*/
void ensure_values_fetched() const;
+ void apply_terms_buffer() const;
+
protected:
/** Document value slots and their contents.
*
@@ -198,7 +209,7 @@ class Document::Internal : public Xapian::Internal::intrusive_base {
* compared to the version read, otherwise it means modifications
* compared to an empty database.
*/
- bool terms_modified() const { return terms != NULL; }
+ bool terms_modified() const {
+     // A materialised terms map or a pending buffer of additions both
+     // count as "might have been modified".
+     return terms || terms_buffer;
+ }
/** Return true if the document's values might have been modified.
*
@@ -251,6 +262,11 @@ class Document::Internal : public Xapian::Internal::intrusive_base {
void add_term(const std::string& term, Xapian::termcount wdf_inc) {
ensure_terms_fetched();
+ if (terms_buffer) {
+ terms_buffer->push_back({term, wdf_inc, false, 0});
+ return;
+ }
+
auto i = terms->find(term);
if (i == terms->end()) {
++termlist_size;
@@ -265,6 +281,8 @@ class Document::Internal : public Xapian::Internal::intrusive_base {
bool remove_term(const std::string& term) {
ensure_terms_fetched();
+ apply_terms_buffer();
+
auto i = terms->find(term);
if (i == terms->end()) {
return false;
@@ -286,6 +304,11 @@ class Document::Internal : public Xapian::Internal::intrusive_base {
ensure_terms_fetched();
positions_modified_ = true;
+ if (terms_buffer) {
+ terms_buffer->push_back({term, wdf_inc, true, term_pos});
+ return;
+ }
+
auto i = terms->find(term);
if (i == terms->end()) {
++termlist_size;
@@ -305,6 +328,8 @@ class Document::Internal : public Xapian::Internal::intrusive_base {
Xapian::termcount wdf_dec) {
ensure_terms_fetched();
+ apply_terms_buffer();
+
auto i = terms->find(term);
if (i == terms->end() || i->second.is_deleted()) {
return remove_posting_result::NO_TERM;
@@ -330,6 +355,8 @@ class Document::Internal : public Xapian::Internal::intrusive_base {
Xapian::termpos& n_removed) {
ensure_terms_fetched();
+ apply_terms_buffer();
+
auto i = terms->find(term);
if (i == terms->end() || i->second.is_deleted()) {
return remove_posting_result::NO_TERM;
@@ -352,6 +379,9 @@ class Document::Internal : public Xapian::Internal::intrusive_base {
/// Clear all terms from the document.
void clear_terms() {
+ if (terms_buffer) {
+ terms_buffer.reset();
+ }
if (!terms) {
if (database.get()) {
terms.reset(new terms_type());
@@ -370,6 +400,8 @@ class Document::Internal : public Xapian::Internal::intrusive_base {
/// Return the number of distinct terms in this document.
Xapian::termcount termlist_count() const {
+ apply_terms_buffer();
+
if (terms)
return termlist_size;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment