# tomelm/classify.py

## classify.py
# clf = sklearn.linear_model.LogisticRegression
# significant_terms = set of terms appearing more than n times in training
def classify(left_name, right_name):
    """
    Classify a pair of names using delta term analysis.

    The two names are aligned into delta terms, the terms are filtered down
    to those whose word is in ``significant_terms``, and the filtered terms
    are scored by ``clf``.

    :param left_name: first name of the pair; passed unchanged to
        ``get_name_delta_terms``.
    :param right_name: second name of the pair; passed unchanged to
        ``get_name_delta_terms``.
    :return:
        A tuple (p_is_duplicate, exact_match_rare_terms, one_side_rare_terms).

        * p_is_duplicate is the score from the log-linear classifier. It's
          probably the most relevant signal.
        * exact_match_rare_terms is the number of significant terms that are
          RARE and exact matches. This is generally a positive signal.
        * one_side_rare_terms is the number of significant terms that are
          RARE and either only occur on one side, or are partial matches.
          This is generally a negative signal.
    :rtype: (float, int, int)
    """
    # Calls a word aligner on the two names. Returns a list of
    # ExactMatchTerm(string)/OneSideTerm(string) objects.
    delta_terms = get_name_delta_terms(left_name, right_name)

    significant_delta_terms = [
        term for term in delta_terms if term.word in significant_terms
    ]

    # predict_proba returns one row per sample; the trailing comma unpacks the
    # single row. Only the second column is used.
    # NOTE(review): assumes clf.classes_ is ordered (negative, positive) -- confirm.
    (_, p_true), = clf.predict_proba(feature_vectorize(significant_delta_terms))

    # Signals outside of the classifier prediction. Filter the rare terms
    # once so is_rare() runs a single time per term.
    rare_terms = [term for term in significant_delta_terms if term.is_rare()]
    exact_match_rare_terms = sum(
        1 for term in rare_terms if isinstance(term, ExactMatchTerm)
    )
    one_side_rare_terms = sum(
        1 for term in rare_terms if isinstance(term, OneSideTerm)
    )

    return p_true, exact_match_rare_terms, one_side_rare_terms
	# clf = sklearn.linear_model.LogisticRegression
	# significant_terms = set of terms appearing more than n times in training
	def classify(left_name, right_name):
		"""
		Classify a pair of names using delta term analysis.

		The two names are aligned into delta terms, the terms are filtered down
		to those whose word is in ``significant_terms``, and the filtered terms
		are scored by ``clf``.

		:param left_name: first name of the pair; passed unchanged to
			``get_name_delta_terms``.
		:param right_name: second name of the pair; passed unchanged to
			``get_name_delta_terms``.
		:return:
			A tuple (p_is_duplicate, exact_match_rare_terms, one_side_rare_terms).

			* p_is_duplicate is the score from the log-linear classifier. It's
			  probably the most relevant signal.
			* exact_match_rare_terms is the number of significant terms that are
			  RARE and exact matches. This is generally a positive signal.
			* one_side_rare_terms is the number of significant terms that are
			  RARE and either only occur on one side, or are partial matches.
			  This is generally a negative signal.
		:rtype: (float, int, int)
		"""
		# Calls a word aligner on the two names. Returns a list of
		# ExactMatchTerm(string)/OneSideTerm(string) objects.
		delta_terms = get_name_delta_terms(left_name, right_name)

		significant_delta_terms = [
			term for term in delta_terms if term.word in significant_terms
		]

		# predict_proba returns one row per sample; the trailing comma unpacks the
		# single row. Only the second column is used.
		# NOTE(review): assumes clf.classes_ is ordered (negative, positive) -- confirm.
		(_, p_true), = clf.predict_proba(feature_vectorize(significant_delta_terms))

		# Signals outside of the classifier prediction. Filter the rare terms
		# once so is_rare() runs a single time per term.
		rare_terms = [term for term in significant_delta_terms if term.is_rare()]
		exact_match_rare_terms = sum(
			1 for term in rare_terms if isinstance(term, ExactMatchTerm)
		)
		one_side_rare_terms = sum(
			1 for term in rare_terms if isinstance(term, OneSideTerm)
		)

		return p_true, exact_match_rare_terms, one_side_rare_terms