Created
June 5, 2013 14:45
-
-
Save eloraburns/5714380 to your computer and use it in GitHub Desktop.
Cartesian-select performance in SQLAlchemy 0.7.9 with 2-layers of loads.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sqlalchemy.ext.declarative import declarative_base | |
from sqlalchemy.orm import relationship, sessionmaker | |
import sqlalchemy as sa | |
# >>> sa.__version__ | |
# '0.7.9' | |
engine = sa.create_engine('sqlite:///cartesian_subquery.db', echo=True) | |
Base = declarative_base() | |
if 1: | |
parent_child_lazy = 'joined' | |
else: | |
# This does not change the child_attribute queries below, but adds a little | |
# extra query noise. | |
parent_child_lazy = 'subquery' | |
class Parent(Base): | |
__tablename__ = 'parent' | |
id = sa.Column(sa.Integer, primary_key=True) | |
left_child_id = sa.Column(sa.Integer, sa.ForeignKey('child.id')) | |
left_child = relationship( | |
'Child', primaryjoin='Child.id == Parent.left_child_id', | |
lazy=parent_child_lazy, uselist=False) | |
right_child_id = sa.Column(sa.Integer, sa.ForeignKey('child.id')) | |
right_child = relationship( | |
'Child', primaryjoin='Child.id == Parent.right_child_id', | |
lazy=parent_child_lazy, uselist=False) | |
class Child(Base): | |
__tablename__ = 'child' | |
id = sa.Column(sa.Integer, primary_key=True) | |
attributes = relationship('ChildAttribute', lazy='subquery') | |
class ChildAttribute(Base): | |
__tablename__ = 'child_attribute' | |
id = sa.Column(sa.Integer, primary_key=True) | |
child_id = sa.Column(sa.Integer, sa.ForeignKey('child.id')) | |
Session = sessionmaker(autocommit=False, bind=engine) | |
Base.metadata.drop_all(bind=engine) | |
Base.metadata.create_all(bind=engine) | |
session = Session() | |
session.add(Child(id=1)) | |
session.add_all([ | |
ChildAttribute(id=10, child_id=1), | |
ChildAttribute(id=11, child_id=1), | |
Child(id=2), | |
ChildAttribute(id=20, child_id=2), | |
ChildAttribute(id=21, child_id=2), | |
Parent(id=100, left_child_id=1, right_child_id=2), | |
Parent(id=101, left_child_id=1, right_child_id=2), | |
]) | |
session.commit() | |
# This query: | |
parents = session.query(Parent).all() | |
# Generates these SQL queries. | |
# The first one looks fine... | |
# sqlite> SELECT parent.id AS parent_id, parent.left_child_id AS parent_left_child_id, parent.right_child_id AS parent_right_child_id, child_1.id AS child_1_id, child_2.id AS child_2_id | |
# ...> FROM parent LEFT OUTER JOIN child AS child_1 ON child_1.id = parent.left_child_id LEFT OUTER JOIN child AS child_2 ON child_2.id = parent.right_child_id; | |
# 100|1|2|1|2 | |
# 101|1|2|1|2 | |
# The second one returns duplicate rows, because it's looking up the ChildAttributes for each Child as left-joined to the Parent, even though the parent is not selected for (or cared about). | |
# sqlite> SELECT child_attribute.id AS child_attribute_id, child_attribute.child_id AS child_attribute_child_id, child_1.id AS child_1_id | |
# ...> FROM (SELECT parent.left_child_id AS parent_left_child_id | |
# ...> FROM parent) AS anon_1 JOIN child AS child_1 ON child_1.id = anon_1.parent_left_child_id JOIN child_attribute ON child_1.id = child_attribute.child_id ORDER BY child_1.id; | |
# 10|1|1 | |
# 11|1|1 | |
# 10|1|1 | |
# 11|1|1 | |
# Same goes here, selecting on right_child_id. | |
# sqlite> SELECT child_attribute.id AS child_attribute_id, child_attribute.child_id AS child_attribute_child_id, child_1.id AS child_1_id | |
# ...> FROM (SELECT parent.right_child_id AS parent_right_child_id | |
# ...> FROM parent) AS anon_1 JOIN child AS child_1 ON child_1.id = anon_1.parent_right_child_id JOIN child_attribute ON child_1.id = child_attribute.child_id ORDER BY child_1.id; | |
# 20|2|2 | |
# 21|2|2 | |
# 20|2|2 | |
# 21|2|2 | |
# Is there a better way to express these queries so that we get just the set of | |
# rows we care about, without duplicates? The "easy" way is adding a "DISTINCT" | |
# keyword at the front. This drastically improved MySQL and Python performance | |
# in my original workload (100krows down to 300rows), but I don't know if there | |
# might be a better way to express the requirements. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Nope, it doesn't help on the actual production queries. But I can see why…for some reason, given the code we actually have (table names changed to protect the innocent):
I guess it's the inner
ORDER BY
(set by higher-level calling code, because we wanted the parent collection sorted when returned byquery.all()
) causing theparent.id
to materialize in the innerSELECT
, so the innerDISTINCT
can't do its job there. :(