Skip to content

Instantly share code, notes, and snippets.

@jbalogh
Created April 7, 2011 19:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jbalogh/908559 to your computer and use it in GitHub Desktop.
Save jbalogh/908559 to your computer and use it in GitHub Desktop.
C extension for calculating addon recommendations
#include <Python.h>
PyObject *array, *ArrayType;
/* Specialized _symmetric_diff_count for array.array() objects.
* We get to skip all the GC and rich comparison functions since array()
* stores C types directly.
*
* This calculates len(set(xs).symmetric_difference(ys)) without the
* extra data structures in between. xs and ys must be sorted in
* ascending order for the algorithm to work.
*/
static Py_ssize_t
array_symmetric_diff_count(PyListObject *a, PyListObject *b) {
Py_ssize_t i, j, rv = 0;
long x, y;
/* Exploit the ascending sort of both lists and traverse both arrays
* together, moving the pointer of the iterator with the smaller
* value. Leftovers are picked up after the loop.
*/
for (i = 0, j = 0; i < Py_SIZE(a) && j < Py_SIZE(b);) {
x = a->ob_item[i], y = b->ob_item[j];
if (x < y) {
rv++;
i++;
} else if (x > y ) {
rv++;
j++;
} else {
i++;
j++;
}
}
if (i < Py_SIZE(a)) {
rv += Py_SIZE(a) - i;
} else if (j < Py_SIZE(b)) {
rv += Py_SIZE(b) - j;
}
return rv;
}
/* Generalized version of array_symmetric_diff_count for Python objects. */
static Py_ssize_t
_symmetric_diff_count(PyObject *self, PyObject *args) {
PyListObject *a, *b;
PyObject *xs, *ys, *x, *y;
Py_ssize_t rv = 0;
int cmp, cnt=0;
/* Parse arguments and get iterators. */
if (!PyArg_ParseTuple(args, "OO", &a, &b))
return -1;
if (PyObject_IsInstance(a, ArrayType) &&
PyObject_IsInstance(b, ArrayType)) {
return array_symmetric_diff_count(a, b);
}
xs = PyObject_GetIter(a);
ys = PyObject_GetIter(b);
if (xs == NULL || ys == NULL){
Py_XDECREF(xs);
Py_XDECREF(ys);
return -1;
}
x = PyIter_Next(xs);
y = PyIter_Next(ys);
while (1) {
if (x == NULL) {
Py_DECREF(xs);
/* Swap the names so we can share the final loop. */
x = y;
xs = ys;
break;
} else if (y == NULL) {
Py_DECREF(ys);
break;
}
cmp = PyObject_Compare(x, y);
if (cmp == -1) {
Py_DECREF(x);
x = PyIter_Next(xs);
rv++;
} else if (cmp == 1) {
Py_DECREF(y);
y = PyIter_Next(ys);
rv++;
} else {
Py_DECREF(x);
Py_DECREF(y);
x = PyIter_Next(xs);
y = PyIter_Next(ys);
}
}
/* xs and x are the only PyObjects left. */
if (PyErr_Occurred()) {
Py_DECREF(x); Py_DECREF(xs);
return -1;
}
while (x != NULL) {
rv++;
Py_DECREF(x);
x = PyIter_Next(xs);
}
Py_DECREF(xs);
return rv;
}
/* A wrapper around _symmetric_diff_count so this is available for
* testing.
*/
static PyObject *
symmetric_diff_count(PyObject *self, PyObject *args) {
Py_ssize_t rv = _symmetric_diff_count(self, args);
if (rv == -1)
return rv;
return PyInt_FromLong(rv);
}
/* Calculate the similarity through a simple euclidean distance. */
static PyObject *
similarity(PyObject *self, PyObject *args) {
Py_ssize_t diff = _symmetric_diff_count(self, args);
double rv = 1. / (1. + diff);
return PyFloat_FromDouble(rv);
}
static PyMethodDef
recommend_methods[] = {
{"symmetric_diff_count", symmetric_diff_count, METH_VARARGS,
"symmetric_diff_count(list1, list2)\n\
\n\
Count the number of items that are in exactly one of the lists.\n\
Both lists are expected to be sorted in ascending order.\n"},
{"similarity", similarity, METH_VARARGS,
"similarity(list1, list2)\n\
\n\
Get a correlation coefficient between the two lists, calculated as\n\
1. / (1. + symmetric_diff_count(list1, list2)\n"},
{NULL, NULL, 0, NULL}
};
PyMODINIT_FUNC
init_recommend(void) {
(void) Py_InitModule("_recommend", recommend_methods);
array = PyImport_ImportModule("array");
ArrayType = PyObject_GetAttrString(array, "array");
}
/* Generalized version of array_symmetric_diff_count for Python objects. */
static Py_ssize_t
_symmetric_diff_count(PyObject *self, PyObject *args) {
PyListObject *a, *b;
PyObject *xs, *ys, *x, *y;
Py_ssize_t rv = 0;
int cmp, cnt=0;
/* Parse arguments and get iterators. */
if (!PyArg_ParseTuple(args, "OO", &a, &b))
return -1;
if (PyObject_IsInstance(a, ArrayType) &&
PyObject_IsInstance(b, ArrayType)) {
return array_symmetric_diff_count(a, b);
}
xs = PyObject_GetIter(a);
ys = PyObject_GetIter(b);
if (xs == NULL || ys == NULL){
Py_XDECREF(xs);
Py_XDECREF(ys);
return -1;
}
x = PyIter_Next(xs);
y = PyIter_Next(ys);
while (1) {
if (x == NULL) {
Py_DECREF(xs);
/* Swap the names so we can share the final loop. */
x = y;
xs = ys;
break;
} else if (y == NULL) {
Py_DECREF(ys);
break;
}
cmp = PyObject_Compare(x, y);
if (cmp == -1) {
Py_DECREF(x);
x = PyIter_Next(xs);
rv++;
} else if (cmp == 1) {
Py_DECREF(y);
y = PyIter_Next(ys);
rv++;
} else {
Py_DECREF(x);
Py_DECREF(y);
x = PyIter_Next(xs);
y = PyIter_Next(ys);
}
}
/* xs and x are the only PyObjects left. */
if (PyErr_Occurred()) {
Py_DECREF(x); Py_DECREF(xs);
return -1;
}
while (x != NULL) {
rv++;
Py_DECREF(x);
x = PyIter_Next(xs);
}
Py_DECREF(xs);
return rv;
}
/* Specialized _symmetric_diff_count for array.array() objects.
* We get to skip all the GC and rich comparison functions since array()
* stores C types directly.
*
* This calculates len(set(xs).symmetric_difference(ys)) without the
* extra data structures in between. xs and ys must be sorted in
* ascending order for the algorithm to work.
*/
static Py_ssize_t
array_symmetric_diff_count(PyListObject *a, PyListObject *b) {
Py_ssize_t i, j, rv = 0;
long x, y;
/* Exploit the ascending sort of both lists and traverse both arrays
* together, moving the pointer of the iterator with the smaller
* value. Leftovers are picked up after the loop.
*/
for (i = 0, j = 0; i < Py_SIZE(a) && j < Py_SIZE(b);) {
x = a->ob_item[i], y = b->ob_item[j];
if (x < y) {
rv++;
i++;
} else if (x > y ) {
rv++;
j++;
} else {
i++;
j++;
}
}
if (i < Py_SIZE(a)) {
rv += Py_SIZE(a) - i;
} else if (j < Py_SIZE(b)) {
rv += Py_SIZE(b) - j;
}
return rv;
}
/* Calculate the similarity through a simple euclidean distance. */
static PyObject *
similarity(PyObject *self, PyObject *args) {
Py_ssize_t diff = _symmetric_diff_count(self, args);
double rv = 1. / (1. + diff);
return PyFloat_FromDouble(rv);
}
/* A wrapper around _symmetric_diff_count so this is available for
* testing.
*/
static PyObject *
symmetric_diff_count(PyObject *self, PyObject *args) {
Py_ssize_t rv = _symmetric_diff_count(self, args);
if (rv == -1)
return rv;
return PyInt_FromLong(rv);
}
static PyMethodDef
recommend_methods[] = {
{"symmetric_diff_count", symmetric_diff_count, METH_VARARGS,
"symmetric_diff_count(list1, list2)\n\
\n\
Count the number of items that are in exactly one of the lists.\n\
Both lists are expected to be sorted in ascending order.\n"},
{"similarity", similarity, METH_VARARGS,
"similarity(list1, list2)\n\
\n\
Get a correlation coefficient between the two lists, calculated as\n\
1. / (1. + symmetric_diff_count(list1, list2)\n"},
{NULL, NULL, 0, NULL}
};
PyMODINIT_FUNC
init_recommend(void) {
(void) Py_InitModule("_recommend", recommend_methods);
array = PyImport_ImportModule("array");
ArrayType = PyObject_GetAttrString(array, "array");
}
# Placeholders for the fast functions implemented in C.
def symmetric_diff_count(xs, ys):
return len(set(xs).symmetric_difference(ys))
def similarity(xs, ys):
return 1. / (1. + symmetric_diff_count(xs, ys))
try:
from _recommend import symmetric_diff_count, similarity
except ImportError:
pass
from array import array
from nose.tools import eq_
import recommend
def test_symmetric_diff_count():
def check(a, b, val):
eq_(recommend.symmetric_diff_count(a, b), val)
vals = [
([], [], 0),
([], [1], 1),
([1], [1], 0),
([], [1, 2], 2),
([], [1, 2, 3], 3),
([1, 2], [1, 3], 2),
([1, 2, 4], [1, 3], 3),
([1, 4], [1, 3], 2),
([1, 2, 4], [1, 2, 3], 2),
([1, 3, 5], [2, 4], 5),
([1, 3, 5], [1, 5], 1),
([1, 3, 5], [4, 5], 3),
]
# Flip the inputs so we test in both directions.
vals.extend([(b, a, n) for a, b, n in vals])
vals.extend([(array('l', a), array('l', b), n) for a, b, n in vals])
for a, b, val in vals:
yield check, a, b, val
# The algorithm is in flux so this is minimal coverage.
def test_similarity():
eq_(1/2., recommend.similarity([1], [1, 2]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment