Last active
August 18, 2023 18:53
-
-
Save jdangerx/533d873f51567a380c7b41e9c37ef56f to your computer and use it in GitHub Desktop.
total axes playground
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/src/ferc_xbrl_extractor/datapackage.py b/src/ferc_xbrl_extractor/datapackage.py | |
index a1deb76..3bf2673 100644 | |
--- a/src/ferc_xbrl_extractor/datapackage.py | |
+++ b/src/ferc_xbrl_extractor/datapackage.py | |
@@ -351,34 +351,29 @@ class FactTable: | |
Args: | |
instance: Parsed XBRL instance used to construct dataframe. | |
""" | |
- # Loop through contexts and get facts in each context | |
- # Each context corresponds to one unique row | |
- df = {} | |
- for column in self.data_columns: | |
- df[column] = { | |
- c_id: self.columns[column](fact.value) | |
- for c_id, fact in instance.get_facts( | |
- self.instant, column, self.schema.primary_key | |
- ).items() | |
- } | |
- | |
- # Create dataframe indexed by context ID | |
- df = pd.DataFrame(df) | |
- | |
- # Expand context ID to contain columns for date, entity ID, and any axes | |
- primary_key = pd.DataFrame( | |
- { | |
- c_id: instance.contexts[c_id].as_primary_key( | |
- instance.filing_name, self.axes | |
- ) | |
- for c_id in df.index | |
- } | |
+ facts = pd.DataFrame( | |
+ fact.dict() | |
+ for fact in instance.get_facts( | |
+ self.instant, self.data_columns, self.schema.primary_key | |
+ ) | |
) | |
+ if facts.empty: | |
+ return facts | |
- # Join on context ID | |
- df = primary_key.T.join(df) | |
- # Drop empty rows | |
- return df.dropna(how="all") | |
+ facts_concepts_wide = ( | |
+ facts.drop_duplicates().set_index(["c_id", "name"])["value"].unstack("name") | |
+ ) | |
+ contexts = facts_concepts_wide.index.to_series().apply( | |
+ lambda c_id: pd.Series( | |
+ instance.contexts[c_id].as_primary_key(instance.filing_name, self.axes) | |
+ ) | |
+ ) | |
+ | |
+ return ( | |
+ contexts.join(facts_concepts_wide) | |
+ .set_index(self.schema.primary_key) | |
+ .dropna(how="all") | |
+ ) | |
class Datapackage(BaseModel): | |
diff --git a/src/ferc_xbrl_extractor/instance.py b/src/ferc_xbrl_extractor/instance.py | |
index de7d6c3..1a8eb57 100644 | |
--- a/src/ferc_xbrl_extractor/instance.py | |
+++ b/src/ferc_xbrl_extractor/instance.py | |
@@ -1,5 +1,7 @@ | |
"""Parse a single instance.""" | |
import io | |
+import itertools | |
+from collections import defaultdict | |
from enum import Enum, auto | |
from typing import BinaryIO | |
@@ -242,7 +244,7 @@ class Instance: | |
self.contexts = contexts | |
def get_facts( | |
- self, instant: bool, concept_name: str, primary_key: list[str] | |
+ self, instant: bool, concept_names: list[str], primary_key: list[str] | |
) -> dict[str, list[Fact]]: | |
"""Return a dictionary that maps Context ID's to a list of facts for each context. | |
@@ -256,11 +258,14 @@ class Instance: | |
else: | |
period_fact_dict = self.duration_facts | |
- return { | |
- fact.c_id: fact | |
- for fact in period_fact_dict.get(concept_name, []) | |
+ all_facts_for_concepts = itertools.chain.from_iterable( | |
+ period_fact_dict.get(concept_name, []) for concept_name in concept_names | |
+ ) | |
+ return ( | |
+ fact | |
+ for fact in all_facts_for_concepts | |
if self.contexts[fact.c_id].check_dimensions(primary_key) | |
- } | |
+ ) | |
class InstanceBuilder: | |
@@ -307,8 +312,8 @@ class InstanceBuilder: | |
# Dictionary mapping context ID's to fact structures | |
# Allows looking up all facts with a specific context ID | |
- instant_facts: dict[str, list[Fact]] = {} | |
- duration_facts: dict[str, list[Fact]] = {} | |
+ instant_facts: dict[str, list[Fact]] = defaultdict(list) | |
+ duration_facts: dict[str, list[Fact]] = defaultdict(list) | |
# Find all contexts in XML file | |
contexts = root.findall(f"{{{XBRL_INSTANCE}}}context") | |
@@ -325,15 +330,11 @@ class InstanceBuilder: | |
for fact in facts: | |
new_fact = Fact.from_xml(fact) | |
- # Sort facts by period type and by name | |
+ # Sort facts by period type | |
if new_fact.value is not None: | |
if context_dict[new_fact.c_id].period.instant: | |
- if new_fact.name not in instant_facts: | |
- instant_facts[new_fact.name] = [] | |
instant_facts[new_fact.name].append(new_fact) | |
else: | |
- if new_fact.name not in duration_facts: | |
- duration_facts[new_fact.name] = [] | |
duration_facts[new_fact.name].append(new_fact) | |
return Instance(context_dict, instant_facts, duration_facts, self.name) | |
diff --git a/tests/integration/datapackage_test.py b/tests/integration/datapackage_test.py | |
index fa4a6e4..fac1640 100644 | |
--- a/tests/integration/datapackage_test.py | |
+++ b/tests/integration/datapackage_test.py | |
@@ -1,4 +1,6 @@ | |
"""Test datapackage descriptor from taxonomy.""" | |
+import io | |
+ | |
import pandas as pd | |
import pytest | |
from frictionless import Package | |
@@ -60,87 +62,36 @@ def _create_schema(instant=True, axes=None): | |
( | |
_create_schema(instant=False), | |
"duration", | |
- pd.DataFrame( | |
- { | |
- "cid_1": { | |
- "entity_id": "EID1", | |
- "filing_name": "filing", | |
- "start_date": "2021-01-01", | |
- "end_date": "2021-12-31", | |
- "column_one": "value 1", | |
- "column_two": "value 2", | |
- }, | |
- "cid_4": { | |
- "entity_id": "EID1", | |
- "filing_name": "filing", | |
- "start_date": "2020-01-01", | |
- "end_date": "2020-12-31", | |
- "column_one": "value 3", | |
- "column_two": "value 4", | |
- }, | |
- } | |
- ).T, | |
+ pd.read_csv( | |
+ io.StringIO( | |
+ "c_id,entity_id,filing_name,start_date,end_date,column_one,column_two\n" | |
+ 'cid_1,EID1,filing,2021-01-01,2021-12-31,"value 1","value 2"\n' | |
+ 'cid_4,EID1,filing,2020-01-01,2020-12-31,"value 3","value 4"\n' | |
+ ) | |
+ ), | |
), | |
( | |
_create_schema(instant=False, axes=["dimension_one_axis"]), | |
"duration", | |
- pd.DataFrame( | |
- { | |
- "cid_1": { | |
- "entity_id": "EID1", | |
- "filing_name": "filing", | |
- "start_date": "2021-01-01", | |
- "end_date": "2021-12-31", | |
- "dimension_one_axis": "Total", | |
- "column_one": "value 1", | |
- "column_two": "value 2", | |
- }, | |
- "cid_4": { | |
- "entity_id": "EID1", | |
- "filing_name": "filing", | |
- "start_date": "2020-01-01", | |
- "end_date": "2020-12-31", | |
- "dimension_one_axis": "Total", | |
- "column_one": "value 3", | |
- "column_two": "value 4", | |
- }, | |
- "cid_5": { | |
- "entity_id": "EID1", | |
- "filing_name": "filing", | |
- "start_date": "2020-01-01", | |
- "end_date": "2020-12-31", | |
- "dimension_one_axis": "Dim 1 Value", | |
- "column_one": "value 9", | |
- "column_two": "value 10", | |
- }, | |
- } | |
- ).T, | |
+ pd.read_csv( | |
+ io.StringIO( | |
+ "c_id,entity_id,filing_name,start_date,end_date,dimension_one_axis,column_one,column_two\n" | |
+ 'cid_1,EID1,filing,2021-01-01,2021-12-31,Total,"value 1","value 2"\n' | |
+ 'cid_4,EID1,filing,2020-01-01,2020-12-31,Total,"value 3","value 4"\n' | |
+ 'cid_5,EID1,filing,2020-01-01,2020-12-31,"Dim 1 Value","value 9","value 10"\n' | |
+ ) | |
+ ), | |
), | |
( | |
_create_schema(axes=["dimension_one_axis", "dimension_two_axis"]), | |
"instant", | |
- pd.DataFrame( | |
- { | |
- "cid_2": { | |
- "entity_id": "EID1", | |
- "filing_name": "filing", | |
- "date": "2021-12-31", | |
- "dimension_one_axis": "Total", | |
- "dimension_two_axis": "Total", | |
- "column_one": "value 5", | |
- "column_two": "value 6", | |
- }, | |
- "cid_3": { | |
- "entity_id": "EID1", | |
- "filing_name": "filing", | |
- "date": "2021-12-31", | |
- "dimension_one_axis": "Dim 1 Value", | |
- "dimension_two_axis": "ferc:Dimension2Value", | |
- "column_one": "value 7", | |
- "column_two": "value 8", | |
- }, | |
- } | |
- ).T, | |
+ pd.read_csv( | |
+ io.StringIO( | |
+ "c_id,entity_id,filing_name,date,dimension_one_axis,dimension_two_axis,column_one,column_two\n" | |
+ 'cid_2,EID1,filing,2021-12-31,Total,Total,"value 5","value 6"\n' | |
+ 'cid_3,EID1,filing,2021-12-31,"Dim 1 Value","ferc:Dimension2Value","value 7","value 8"\n' | |
+ ) | |
+ ), | |
), | |
], | |
) | |
@@ -152,4 +103,5 @@ def test_construct_dataframe(table_schema, period, df, in_memory_filing): | |
fact_table = FactTable(table_schema, period) | |
constructed_df = fact_table.construct_dataframe(instance) | |
- pd.testing.assert_frame_equal(df, constructed_df) | |
+ expected_df = df.set_index(table_schema.primary_key).drop("c_id", axis="columns") | |
+ pd.testing.assert_frame_equal(expected_df, constructed_df) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment