@matthiasgoergens · Created June 27, 2022
Match subsets with same sum
"""
https://www.reddit.com/r/algorithms/comments/vi75ip/equal_sums_for_subsets_of_2_different_lists/
For example, let's say you have a warehouse and you want to keep a record of the merchandise, bananas, that goes in and out. If you have two people keeping records and they work independently, we would expect them to have the same recorded values and identical records. However, it is possible that one person created one record for two shipments, creating a sum that I want to detect here. It is also possible that one person miscounted, so that value should not have a matching element in the other person's records. Ultimately, it is the erroneous and non-matching entries that I want to identify. If they do not match, then we know there was a recording error somewhere and we need to investigate it.
I'm trying to reconcile those records. First, I remove all entries that match up perfectly. Then, I need to find out if there is some combination of records from person A that matches up with some combination of records from person B. Unfortunately, there is no way to simplify the record-keeping process: there will always be two sets of records, and they are always independent.
---
I have been looking for any way to group the elements. However:
- We cannot assume that sequentially added entries are sequential in the real world
- There are no dates to work with
- Either list, or both of them, could be incomplete
- There are no text labels that can be used to create relationships between individual entries or even to create subsets of the records
It really is the worst-case scenario. Two lists, full of numbers. Sometimes 100+ entries each.
I recently finished a brute-force method that kind of works and finds some entries. It starts searching and removes any entries that reconcile from the list of possible candidates as it goes. I'm also storing sums in a dictionary and removing entries that are no longer required when elements get matched. This works to a very small degree. One of my test sets has over 100 entries in each set of logs. I gave up on that one after an hour and 16 GB of RAM usage.
I'm working on adding some limitations. They are somewhat arbitrary but based on what would make sense.
Currently, I am doing some research to see if I can find a more clever solution. You had mentioned integer linear programming. I don't know what this is but I am looking into it.
I'll look into an edit distance algorithm. That could be helpful.
---
Interesting. I'll look into Subset Sum but unfortunately I cannot guarantee that the values will be positive.
---
So the setup is:
We create a bunch of elements (arbitrary rational numbers).
Then we create two transaction lists from them:
- shuffle the elements
- wiggle some of them
- drop some of them
- group some of them
- shuffle again
Then try to reconcile the transaction lists to recover the original list of elements.
'Some of them' corresponds to specific fractions, i.e., errors are rare-ish.
--
For the recovery, we use integer linear programming?
(Or something from compressed sensing???)
We could directly try to find subsets that add up to the same.
Or alternatively, we could try to recreate the elements, and how they got dropped, wiggled, grouped and shuffled.
Then minimize the dropping, wiggling and grouping.
--
shuffling == quadratic number of variables? (Yes: one flow variable per (x, y) pair, so |xs| * |ys| of them.)
"""
import random
import signal
import sys
import threading
import time
from collections import Counter
from pprint import pp

# python3 -m pip install ortools
from ortools.linear_solver import pywraplp

MAX_ELEMENT = 100


def create_elements(n):
    # The 'secret' elements: n random values in [-MAX_ELEMENT, MAX_ELEMENT],
    # rounded to 3 decimals.
    return [round(random.uniform(-MAX_ELEMENT, MAX_ELEMENT), 3) for _ in range(n)]

def shuffle(items):
    # Return a shuffled copy; leave the input untouched.
    items = items[:]
    random.shuffle(items)
    return items

def drop(items, p_drop):
    # Drop each item independently with probability p_drop.
    return [item for item in items if random.random() > p_drop]

def wiggle(items, p_wiggle):
    # With probability p_wiggle, add rounded Gaussian noise to an item;
    # otherwise add 0. (cum_weights=(p_wiggle, 1) picks the noisy branch
    # with probability p_wiggle.)
    return [
        item
        + random.choices(
            (round(random.gauss(0, 1), 3), 0), cum_weights=(p_wiggle, 1)
        )[0]
        for item in items
    ]

def group(items, p_group):
    # With probability p_group, merge an item with the following ones
    # into a single entry (their sum).
    group = []
    grouped = []
    for item in items:
        group.append(item)
        if random.random() > p_group:
            grouped.append(sum(group))
            group = []
    # Flush whatever is left in the last group.
    if len(group) > 0:
        grouped.append(sum(group))
    return grouped

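# Illustrative example (the actual output depends on the random draws):
# group([1, 2, 3, 4], p_group=0.5) might return [1, 5, 4], with 2 and 3
# merged into a single entry of 5.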
# p_* are the probabilities of the various errors.
def add_noise(elements, p_drop=0.01, p_wiggle=0.01, p_group=0.05):
    # Consider logging the noise somewhere,
    # so we can see how well we managed to reconstruct?
    items = shuffle(elements)
    items = drop(items, p_drop=p_drop)
    items = wiggle(items, p_wiggle=p_wiggle)
    items = group(items, p_group=p_group)
    return shuffle(items)

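# Illustrative run (hypothetical draws): add_noise([10.0, 20.0, 30.0])
# could yield [30.0, 29.6] if 10.0 and 20.0 were grouped into one entry
# of 30.0 and the original 30.0 was wiggled down to 29.6.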
def create_input(n):
    # items is the 'secret' we want to reconstruct
    # from the two noisy measurements.
    items = create_elements(n)
    a = add_noise(items, p_wiggle=0)
    b = add_noise(items, p_wiggle=0)
    return items, a, b

def counter_to_list(c):
    # Expand a Counter back into a sorted list, repeating each element
    # according to its multiplicity.
    return sorted([x for item, count in c.items() for x in count * [item]])

def match_same(a, b):
    # print(f"len(a): {len(a)}\tlen(b): {len(b)}")
    a = Counter(a)
    b = Counter(b)
    # Remove common elements that just match directly.
    common = a & b
    a_only = a - common
    b_only = b - common
    # Just to make sure we are using the API correctly:
    assert a_only + common == a
    assert b_only + common == b
    return counter_to_list(a_only), counter_to_list(b_only), counter_to_list(common)

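# For example, match_same([1, 2, 2, 5], [2, 5, 9]) returns
# ([1, 2], [9], [2, 5]): one 2 and the 5 match exactly, and the
# leftovers [1, 2] and [9] are handed to the ILP.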
def recover(a, b):
    a_only, b_only, common = match_same(a, b)
    pp(len(a_only))
    pp(len(b_only))
    return ilp(a_only, b_only)

def ilp(xs, ys):
    if len(xs) == 0 or len(ys) == 0:
        # You probably want to do something more sensible here in production code.
        print("Nothing left to do")
        return
    solver = pywraplp.Solver.CreateSolver('SCIP')
    # This is for debug output:
    solver.EnableOutput()
    # NOTE: set depending on how many cores you have.
    solver.SetNumThreads(6)
    infinity = solver.infinity()
    # We accumulate penalties to be minimized for our solution in this variable.
    penalty = 0
    # One continuous 'flow' variable and one boolean 'used' variable per
    # (x, y) pair -- this is where the quadratic blow-up happens.
    pipe_flow = {
        (xi, yi): solver.NumVar(lb=-infinity, ub=infinity, name=f'flow_{xi}_{yi}')
        for xi, x in enumerate(xs)
        for yi, y in enumerate(ys)
    }
    pipe_used = {
        (xi, yi): solver.BoolVar(name=f'pipe_used_{xi}_{yi}')
        for xi, x in enumerate(xs)
        for yi, y in enumerate(ys)
    }
    # Ask for sparsity:
    for xi, x in enumerate(xs):
        for yi, y in enumerate(ys):
            # Some of our elements can be negative, so need to constrain both
            # from above and from below.
            # NOTE: I don't know whether 10 here is a good constant.
            # This sparsity constraint is the slowest part of the solving.
            solver.Add(10 * MAX_ELEMENT * pipe_used[xi, yi] >= pipe_flow[xi, yi])
            solver.Add(-10 * MAX_ELEMENT * pipe_used[xi, yi] <= pipe_flow[xi, yi])
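    # The two constraints above are a standard big-M linking,
    #   -M * used <= flow <= M * used   with M = 10 * MAX_ELEMENT,
    # so a pipe can only carry flow when its boolean is switched on;
    # charging for switched-on booleans in the objective then pushes
    # toward sparse matchings (few pipes).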
    total_pipes_used = 0
    for is_used in pipe_used.values():
        # TODO: What's a good weight for the penalty?
        # We perturb the penalty a bit to break symmetries.
        total_pipes_used += is_used
    total_pipes_used_var = solver.NumVar(lb=0, ub=infinity, name='total_pipes_used')
    solver.Add(total_pipes_used_var >= total_pipes_used)
    # Tell the solver not to bother minimizing the pipes used below a reasonable minimum:
    min_pipes_used = max(len(xs), len(ys))
    solver.Add(total_pipes_used_var >= min_pipes_used)
    penalty += total_pipes_used_var
    # Flow conservation for each entry, on both sides, with slack.
    pipe_flow_yx = {(yi, xi): flow for (xi, yi), flow in pipe_flow.items()}
    for who, groups, flows in [('xs', xs, pipe_flow), ('ys', ys, pipe_flow_yx)]:
        for g_i, g in enumerate(groups):
            # Index slack variables by side and position, so names stay
            # unique even when entry values repeat.
            slack_left = solver.NumVar(
                lb=0, ub=infinity, name=f'group_slack_left_{who}_{g_i}')
            slack_right = solver.NumVar(
                lb=0, ub=infinity, name=f'group_slack_right_{who}_{g_i}')
            solver.Add(
                g + slack_left
                == slack_right
                + solver.Sum(flow for (mi, ni), flow in flows.items() if mi == g_i))
            # TODO: what's a good multiplier for the penalty?
            penalty += 1 * (slack_left + slack_right)
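            # Reading the constraint above: the entry's value plus left
            # slack must equal the right slack plus the total flow through
            # that entry. Slack absorbs wiggled or dropped amounts, and
            # the objective charges for it, so exact matchings are
            # preferred wherever they exist.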
    # Penalize slack and pipe count.
    solver.Minimize(penalty)

    def solve():
        print("Starting solve")
        solver.SetTimeLimit(60 * 1000)  # NOTE: SetTimeLimit is in milliseconds.
        time.sleep(1)
        status = solver.Solve()
        if status == pywraplp.Solver.OPTIMAL:
            print('Solution:')
            print('Objective value =', solver.Objective().Value())
        else:
            print('The problem does not have an optimal solution.')
        print('\nAdvanced usage:')
        print('Problem solved in %f milliseconds' % solver.wall_time())
        print('Problem solved in %d iterations' % solver.iterations())
        print('Problem solved in %d branch-and-bound nodes' % solver.nodes())

    # Multi-threading doesn't really seem to work, yet..
    t = threading.Thread(target=solve)
    t.start()

    def done(*args, **kwargs):
        print("Got interrupt.")
        solver.InterruptSolve()
        t.join()
        print("Done")
        sys.exit(0)

    # Let Ctrl-C interrupt the solve cleanly.
    signal.signal(signal.SIGINT, done)
    while True:
        print(f"iterations: {solver.iterations()}")
        time.sleep(1)

def analysis(items):
    pp(len(items))

if __name__ == '__main__':
    random.seed(0)
    items, a, b = create_input(n=200)
    analysis(items)
    analysis(a)
    analysis(b)
    recover(a, b)