Skip to content

Instantly share code, notes, and snippets.

@djour
Created April 19, 2012 16:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save djour/2422049 to your computer and use it in GitHub Desktop.
Save djour/2422049 to your computer and use it in GitHub Desktop.
Patch implementing the tf-idf weighting scheme (the basic scheme in SMART)
diff --git a/xapian-core/include/xapian/weight.h b/xapian-core/include/xapian/weight.h
index 09cbe46..0d9a3ac 100644
--- a/xapian-core/include/xapian/weight.h
+++ b/xapian-core/include/xapian/weight.h
@@ -506,6 +506,51 @@ class XAPIAN_VISIBILITY_DEFAULT TradWeight : public Weight {
double get_maxextra() const;
};
+/** Xapian::Weight subclass implementing the basic tf-idf scheme.
+ *
+ * This class implements the basic tf-idf weighting scheme described in
+ * SMART: each matching term contributes wdf * idf, with no rescaling of
+ * the term frequency and no normalisation of the resulting weight.
+ *
+ * NOTE(review): this was labelled SMART "nnn", but since an idf factor is
+ * applied the usual SMART triple would seem to be "ntn" - confirm.
+ *
+ * There are no parameters in this basic implementation; in the future
+ * they could select other computations of tf, df and normalisation.
+ */
+class XAPIAN_VISIBILITY_DEFAULT Tf_idfWeight : public Weight {
+ /// idf of the term; set by init(), read by get_sumpart()/get_maxpart().
+ mutable double idf;
+
+ Tf_idfWeight * clone() const;
+
+ void init(double factor);
+
+ public:
+ /** Construct a tf-idf weight and request the statistics it uses.
+ * idf starts at 0 so that it is never read uninitialised; parameters
+ * for other re-computations may be added in the future.
+ */
+ Tf_idfWeight() : idf(0.0) {
+ need_stat(COLLECTION_SIZE);
+ need_stat(TERMFREQ);
+ need_stat(WDF);
+ need_stat(WDF_MAX);
+ }
+
+ std::string name() const;
+
+ std::string serialise() const;
+ Tf_idfWeight * unserialise(const std::string & s) const;
+
+ double get_sumpart(Xapian::termcount wdf,
+ Xapian::termcount doclen) const;
+ double get_maxpart() const;
+
+ double get_sumextra(Xapian::termcount doclen) const;
+ double get_maxextra() const;
+};
+
}
#endif // XAPIAN_INCLUDED_WEIGHT_H
diff --git a/xapian-core/weight/tf_idf.cc b/xapian-core/weight/tf_idf.cc
index e69de29..d0fb9c9 100644
--- a/xapian-core/weight/tf_idf.cc
+++ b/xapian-core/weight/tf_idf.cc
@@ -0,0 +1,99 @@
+/** @file tf_idf.cc
+ * @brief Xapian::Tf_idfWeight class - tf-idf weighting scheme
+ */
+/* Copyright (C) 2012 Jiuding Duan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <config.h>
+
+#include <xapian/weight.h>
+
+#include "debuglog.h"
+#include "omassert.h"
+#include "serialise-double.h"
+
+#include "xapian/error.h"
+
+#include <cmath>
+
+using namespace std;
+
+namespace Xapian {
+
+Tf_idfWeight *
+Tf_idfWeight::clone() const
+{
+ return new Tf_idfWeight; // Default-constructed; no state is copied over.
+}
+
+void
+Tf_idfWeight::init(double /* factor: not applied yet */)
+{
+ // Divide as doubles: N/tf on integer types truncated before the log().
+ // A zero termfreq would divide by zero, so it yields an idf of 0 instead.
+ const Xapian::doccount tf = get_termfreq();
+ idf = tf ? log(double(get_collection_size()) / double(tf)) : 0.0;
+}
+
+string
+Tf_idfWeight::name() const
+{
+ return string("Xapian::Tf_idfWeight"); // Fully qualified class name.
+}
+
+string
+Tf_idfWeight::serialise() const
+{
+ return string(); // Always the empty string: nothing is encoded.
+}
+
+Tf_idfWeight *
+Tf_idfWeight::unserialise(const string &) const
+{
+ return new Tf_idfWeight(); // The serialised string is not inspected.
+}
+
+double
+Tf_idfWeight::get_sumpart(Xapian::termcount wdf,
+ Xapian::termcount len) const
+{
+ // Raw within-document frequency times idf; len takes no part in this scheme.
+ return double(wdf) * idf;
+}
+
+double
+Tf_idfWeight::get_maxpart() const
+{
+ // Upper bound on wdf (clamped to at least 1) times idf.
+ return double(max(get_wdf_upper_bound(), Xapian::termcount(1))) * idf;
+}
+
+double
+Tf_idfWeight::get_sumextra(Xapian::termcount) const
+{
+ return 0.0; // The argument is ignored; the extra weight is always zero.
+}
+
+double
+Tf_idfWeight::get_maxextra() const
+{
+ return 0.0; // Matches get_sumextra(), which always returns zero.
+}
+
+}
+
+
diff --git a/xapian-core/weight/tf_idfweight.cc b/xapian-core/weight/tf_idfweight.cc
deleted file mode 100644
index e69de29..0000000
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment