eloraburns/cartesian_subquery.py

## cartesian_subquery.py
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker
import sqlalchemy as sa

# >>> sa.__version__
# '0.7.9'

engine = sa.create_engine('sqlite:///cartesian_subquery.db', echo=True)

Base = declarative_base()

if 1:
    parent_child_lazy = 'joined'
else:
    # This does not change the child_attribute queries below, but adds a little
    # extra query noise.
    parent_child_lazy = 'subquery'

class Parent(Base):
    __tablename__ = 'parent'
    id = sa.Column(sa.Integer, primary_key=True)
    left_child_id = sa.Column(sa.Integer, sa.ForeignKey('child.id'))
    left_child = relationship(
        'Child', primaryjoin='Child.id == Parent.left_child_id',
        lazy=parent_child_lazy, uselist=False)
    right_child_id = sa.Column(sa.Integer, sa.ForeignKey('child.id'))
    right_child = relationship(
        'Child', primaryjoin='Child.id == Parent.right_child_id',
        lazy=parent_child_lazy, uselist=False)


class Child(Base):
    __tablename__ = 'child'
    id = sa.Column(sa.Integer, primary_key=True)
    attributes = relationship('ChildAttribute', lazy='subquery')


class ChildAttribute(Base):
    __tablename__ = 'child_attribute'
    id = sa.Column(sa.Integer, primary_key=True)
    child_id = sa.Column(sa.Integer, sa.ForeignKey('child.id'))


Session = sessionmaker(autocommit=False, bind=engine)
Base.metadata.drop_all(bind=engine)
Base.metadata.create_all(bind=engine)

session = Session()

session.add(Child(id=1))
session.add_all([
    ChildAttribute(id=10, child_id=1),
    ChildAttribute(id=11, child_id=1),
    Child(id=2),
    ChildAttribute(id=20, child_id=2),
    ChildAttribute(id=21, child_id=2),
    Parent(id=100, left_child_id=1, right_child_id=2),
    Parent(id=101, left_child_id=1, right_child_id=2),
])
session.commit()


# This query:
parents = session.query(Parent).all()

# Generates these SQL queries.
# The first one looks fine...
# sqlite> SELECT parent.id AS parent_id, parent.left_child_id AS parent_left_child_id, parent.right_child_id AS parent_right_child_id, child_1.id AS child_1_id, child_2.id AS child_2_id
#    ...> FROM parent LEFT OUTER JOIN child AS child_1 ON child_1.id = parent.left_child_id LEFT OUTER JOIN child AS child_2 ON child_2.id = parent.right_child_id;
# 100|1|2|1|2
# 101|1|2|1|2

# The second one returns duplicate rows, because it's looking up the ChildAttributes for each Child as left-joined to the Parent, even though the parent is not selected for (or cared about).
# sqlite> SELECT child_attribute.id AS child_attribute_id, child_attribute.child_id AS child_attribute_child_id, child_1.id AS child_1_id
#    ...> FROM (SELECT parent.left_child_id AS parent_left_child_id
#    ...> FROM parent) AS anon_1 JOIN child AS child_1 ON child_1.id = anon_1.parent_left_child_id JOIN child_attribute ON child_1.id = child_attribute.child_id ORDER BY child_1.id;
# 10|1|1
# 11|1|1
# 10|1|1
# 11|1|1

# Same goes here, selecting on right_child_id.
# sqlite> SELECT child_attribute.id AS child_attribute_id, child_attribute.child_id AS child_attribute_child_id, child_1.id AS child_1_id
#    ...> FROM (SELECT parent.right_child_id AS parent_right_child_id
#    ...> FROM parent) AS anon_1 JOIN child AS child_1 ON child_1.id = anon_1.parent_right_child_id JOIN child_attribute ON child_1.id = child_attribute.child_id ORDER BY child_1.id;
# 20|2|2
# 21|2|2
# 20|2|2
# 21|2|2

# Is there a better way to express these queries so that we get just the set of
# rows we care about, without duplicates? The "easy" way is adding a "DISTINCT"
# keyword at the front. This drastically improved MySQL and Python performance
# in my original workload (100krows down to 300rows), but I don't know if there
# might be a better way to express the requirements.
	from sqlalchemy.ext.declarative import declarative_base
	from sqlalchemy.orm import relationship, sessionmaker
	import sqlalchemy as sa

	# >>> sa.__version__
	# '0.7.9'

	engine = sa.create_engine('sqlite:///cartesian_subquery.db', echo=True)

	Base = declarative_base()

	if 1:
	parent_child_lazy = 'joined'
	else:
	# This does not change the child_attribute queries below, but adds a little
	# extra query noise.
	parent_child_lazy = 'subquery'

	class Parent(Base):
	__tablename__ = 'parent'
	id = sa.Column(sa.Integer, primary_key=True)
	left_child_id = sa.Column(sa.Integer, sa.ForeignKey('child.id'))
	left_child = relationship(
	'Child', primaryjoin='Child.id == Parent.left_child_id',
	lazy=parent_child_lazy, uselist=False)
	right_child_id = sa.Column(sa.Integer, sa.ForeignKey('child.id'))
	right_child = relationship(
	'Child', primaryjoin='Child.id == Parent.right_child_id',
	lazy=parent_child_lazy, uselist=False)


	class Child(Base):
	__tablename__ = 'child'
	id = sa.Column(sa.Integer, primary_key=True)
	attributes = relationship('ChildAttribute', lazy='subquery')


	class ChildAttribute(Base):
	__tablename__ = 'child_attribute'
	id = sa.Column(sa.Integer, primary_key=True)
	child_id = sa.Column(sa.Integer, sa.ForeignKey('child.id'))


	Session = sessionmaker(autocommit=False, bind=engine)
	Base.metadata.drop_all(bind=engine)
	Base.metadata.create_all(bind=engine)

	session = Session()

	session.add(Child(id=1))
	session.add_all([
	ChildAttribute(id=10, child_id=1),
	ChildAttribute(id=11, child_id=1),
	Child(id=2),
	ChildAttribute(id=20, child_id=2),
	ChildAttribute(id=21, child_id=2),
	Parent(id=100, left_child_id=1, right_child_id=2),
	Parent(id=101, left_child_id=1, right_child_id=2),
	])
	session.commit()


	# This query:
	parents = session.query(Parent).all()

	# Generates these SQL queries.
	# The first one looks fine...
	# sqlite> SELECT parent.id AS parent_id, parent.left_child_id AS parent_left_child_id, parent.right_child_id AS parent_right_child_id, child_1.id AS child_1_id, child_2.id AS child_2_id
	# ...> FROM parent LEFT OUTER JOIN child AS child_1 ON child_1.id = parent.left_child_id LEFT OUTER JOIN child AS child_2 ON child_2.id = parent.right_child_id;
	# 100\|1\|2\|1\|2
	# 101\|1\|2\|1\|2

	# The second one returns duplicate rows, because it's looking up the ChildAttributes for each Child as left-joined to the Parent, even though the parent is not selected for (or cared about).
	# sqlite> SELECT child_attribute.id AS child_attribute_id, child_attribute.child_id AS child_attribute_child_id, child_1.id AS child_1_id
	# ...> FROM (SELECT parent.left_child_id AS parent_left_child_id
	# ...> FROM parent) AS anon_1 JOIN child AS child_1 ON child_1.id = anon_1.parent_left_child_id JOIN child_attribute ON child_1.id = child_attribute.child_id ORDER BY child_1.id;
	# 10\|1\|1
	# 11\|1\|1
	# 10\|1\|1
	# 11\|1\|1

	# Same goes here, selecting on right_child_id.
	# sqlite> SELECT child_attribute.id AS child_attribute_id, child_attribute.child_id AS child_attribute_child_id, child_1.id AS child_1_id
	# ...> FROM (SELECT parent.right_child_id AS parent_right_child_id
	# ...> FROM parent) AS anon_1 JOIN child AS child_1 ON child_1.id = anon_1.parent_right_child_id JOIN child_attribute ON child_1.id = child_attribute.child_id ORDER BY child_1.id;
	# 20\|2\|2
	# 21\|2\|2
	# 20\|2\|2
	# 21\|2\|2

	# Is there a better way to express these queries so that we get just the set of
	# rows we care about, without duplicates? The "easy" way is adding a "DISTINCT"
	# keyword at the front. This drastically improved MySQL and Python performance
	# in my original workload (100krows down to 300rows), but I don't know if there
	# might be a better way to express the requirements.