Skip to content

Instantly share code, notes, and snippets.

@pbellon
Last active December 1, 2020 08:35
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pbellon/e5440db4ed4400e67ea44ed25836ea17 to your computer and use it in GitHub Desktop.
Save pbellon/e5440db4ed4400e67ea44ed25836ea17 to your computer and use it in GitHub Desktop.
# our model class used to store articles
from models import Article
# utility to get connection to the database.
from django.db import connection
def detect_similarity(similarity_limit=0.7, chars_limit=1000, days_limit=3):
# internal function to process query results and return doublons IDs.
# this is mandatory because you'll end up with a list like below that need
# to be purged from duplicated results (again :D)
#
# Origin ID | Target ID | Similarity (%)
# --------- | --------- | --------------
# 18 | 19 | 0.8
# 19 | 18 | 0.8
# 18 | 20 | 0.9
# 20 | 18 | 0.9
def get_doublons_ids(data):
nodes = {}
good_ids = []
for n in data:
if not nodes.get(n[0]):
nodes[n[0]] = { 'to': n[2], 'date': n[1] }
for node_id in nodes:
node = nodes[node_id]
node_to_id = node['to']
node_to = nodes[node_to_id]
if node_to['date'] > node['date']:
if node_to_id not in good_ids: good_ids.append(node_to_id)
if node_id in good_ids: good_ids.remove(node_id)
else:
if node_to['date'] == node['date']:
node_id = max(node_to_id, node_id)
if not (node_id in good_ids): good_ids.append(node_id)
if node_to_id in good_ids: good_ids.remove(node_to_id)
return map(lambda k: k, filter(lambda k: k not in good_ids, nodes))
# Initialize connection to DB.
cursor = connection.cursor()
# Our first query. It's here to:
# - Use the set_limit function from pg_trgrm to limit similarity results to
# a similarity greater than the given `similarity_limit`, by default 0.7
# - Create the homogenize function to remove punctation, ignore case and
# limit the given `t` to the first `c` given characters
cursor.execute("""
SELECT set_limit({similarity_limit});
CREATE OR REPLACE FUNCTION homogenize (t text, c integer)
RETURNS text as $$
BEGIN
RETURN substring(lower(regexp_replace(t, '\W', '', 'g')) for c);
END;
$$ language plpgsql;
""".format(similarity_limit=similarity_limit))
# the proper similarity detection query
cursor.execute("""
SELECT
-- basic information needed for comparison and result processing
A.id, A.publication_date,
B.id, B.publication_date,
-- similarity computation useful to debug our query
similarity(
homogenize(A.content,{chars_limit}), homogenize(B.content, {chars_limit})
) as sim
FROM article A
JOIN article B
-- similarity JOIN, notice homogenize usage to ease computation.
ON homogenize(A.content, {chars_limit}) <> homogenize(B.content, {chars_limit})
AND homogenize(A.content, {chars_limit}) % homogenize(B.content, {chars_limit})
-- JOIN date limitation
AND B.publication_date <= A.publication_date + integer '{days_limit}'
AND B.publication_date >= A.publication_date - integer '{days_limit}'
ORDER BY sim DESC;
""".format(
days_limit=days_limit,
chars_limit=chars_limit
)
)
# retrieve results
results = cursor.fetchall()
# process them to only have a list of ids
doublons_ids = get_doublons_ids(results)
# we then convert those id to models objects and return the result
return map(lambda pk: Article.objects.get(pk=pk), doublons_ids)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment