Skip to content

Instantly share code, notes, and snippets.

@jdangerx
Last active August 18, 2023 18:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jdangerx/533d873f51567a380c7b41e9c37ef56f to your computer and use it in GitHub Desktop.
Save jdangerx/533d873f51567a380c7b41e9c37ef56f to your computer and use it in GitHub Desktop.
total axes playground
diff --git a/src/ferc_xbrl_extractor/datapackage.py b/src/ferc_xbrl_extractor/datapackage.py
index a1deb76..3bf2673 100644
--- a/src/ferc_xbrl_extractor/datapackage.py
+++ b/src/ferc_xbrl_extractor/datapackage.py
@@ -351,34 +351,29 @@ class FactTable:
Args:
instance: Parsed XBRL instance used to construct dataframe.
"""
- # Loop through contexts and get facts in each context
- # Each context corresponds to one unique row
- df = {}
- for column in self.data_columns:
- df[column] = {
- c_id: self.columns[column](fact.value)
- for c_id, fact in instance.get_facts(
- self.instant, column, self.schema.primary_key
- ).items()
- }
-
- # Create dataframe indexed by context ID
- df = pd.DataFrame(df)
-
- # Expand context ID to contain columns for date, entity ID, and any axes
- primary_key = pd.DataFrame(
- {
- c_id: instance.contexts[c_id].as_primary_key(
- instance.filing_name, self.axes
- )
- for c_id in df.index
- }
+ facts = pd.DataFrame(
+ fact.dict()
+ for fact in instance.get_facts(
+ self.instant, self.data_columns, self.schema.primary_key
+ )
)
+ if facts.empty:
+ return facts
- # Join on context ID
- df = primary_key.T.join(df)
- # Drop empty rows
- return df.dropna(how="all")
+ facts_concepts_wide = (
+ facts.drop_duplicates().set_index(["c_id", "name"])["value"].unstack("name")
+ )
+ contexts = facts_concepts_wide.index.to_series().apply(
+ lambda c_id: pd.Series(
+ instance.contexts[c_id].as_primary_key(instance.filing_name, self.axes)
+ )
+ )
+
+ return (
+ contexts.join(facts_concepts_wide)
+ .set_index(self.schema.primary_key)
+ .dropna(how="all")
+ )
class Datapackage(BaseModel):
diff --git a/src/ferc_xbrl_extractor/instance.py b/src/ferc_xbrl_extractor/instance.py
index de7d6c3..1a8eb57 100644
--- a/src/ferc_xbrl_extractor/instance.py
+++ b/src/ferc_xbrl_extractor/instance.py
@@ -1,5 +1,7 @@
"""Parse a single instance."""
import io
+import itertools
+from collections import defaultdict
from enum import Enum, auto
from typing import BinaryIO
@@ -242,7 +244,7 @@ class Instance:
self.contexts = contexts
def get_facts(
- self, instant: bool, concept_name: str, primary_key: list[str]
+ self, instant: bool, concept_names: str, primary_key: list[str]
) -> dict[str, list[Fact]]:
"""Return a dictionary that maps Context ID's to a list of facts for each context.
@@ -256,11 +258,14 @@ class Instance:
else:
period_fact_dict = self.duration_facts
- return {
- fact.c_id: fact
- for fact in period_fact_dict.get(concept_name, [])
+ all_facts_for_concepts = itertools.chain.from_iterable(
+ period_fact_dict[concept_name] for concept_name in concept_names
+ )
+ return (
+ fact
+ for fact in all_facts_for_concepts
if self.contexts[fact.c_id].check_dimensions(primary_key)
- }
+ )
class InstanceBuilder:
@@ -307,8 +312,8 @@ class InstanceBuilder:
# Dictionary mapping context ID's to fact structures
# Allows looking up all facts with a specific context ID
- instant_facts: dict[str, list[Fact]] = {}
- duration_facts: dict[str, list[Fact]] = {}
+ instant_facts: dict[str, list[Fact]] = defaultdict(list)
+ duration_facts: dict[str, list[Fact]] = defaultdict(list)
# Find all contexts in XML file
contexts = root.findall(f"{{{XBRL_INSTANCE}}}context")
@@ -325,15 +330,11 @@ class InstanceBuilder:
for fact in facts:
new_fact = Fact.from_xml(fact)
- # Sort facts by period type and by name
+ # Sort facts by period type, then group them by fact name
if new_fact.value is not None:
if context_dict[new_fact.c_id].period.instant:
- if new_fact.name not in instant_facts:
- instant_facts[new_fact.name] = []
instant_facts[new_fact.name].append(new_fact)
else:
- if new_fact.name not in duration_facts:
- duration_facts[new_fact.name] = []
duration_facts[new_fact.name].append(new_fact)
return Instance(context_dict, instant_facts, duration_facts, self.name)
diff --git a/tests/integration/datapackage_test.py b/tests/integration/datapackage_test.py
index fa4a6e4..fac1640 100644
--- a/tests/integration/datapackage_test.py
+++ b/tests/integration/datapackage_test.py
@@ -1,4 +1,6 @@
"""Test datapackage descriptor from taxonomy."""
+import io
+
import pandas as pd
import pytest
from frictionless import Package
@@ -60,87 +62,36 @@ def _create_schema(instant=True, axes=None):
(
_create_schema(instant=False),
"duration",
- pd.DataFrame(
- {
- "cid_1": {
- "entity_id": "EID1",
- "filing_name": "filing",
- "start_date": "2021-01-01",
- "end_date": "2021-12-31",
- "column_one": "value 1",
- "column_two": "value 2",
- },
- "cid_4": {
- "entity_id": "EID1",
- "filing_name": "filing",
- "start_date": "2020-01-01",
- "end_date": "2020-12-31",
- "column_one": "value 3",
- "column_two": "value 4",
- },
- }
- ).T,
+ pd.read_csv(
+ io.StringIO(
+ "c_id,entity_id,filing_name,start_date,end_date,column_one,column_two\n"
+ 'cid_1,EID1,filing,2021-01-01,2021-12-31,"value 1","value 2"\n'
+ 'cid_4,EID1,filing,2020-01-01,2020-12-31,"value 3","value 4"\n'
+ )
+ ),
),
(
_create_schema(instant=False, axes=["dimension_one_axis"]),
"duration",
- pd.DataFrame(
- {
- "cid_1": {
- "entity_id": "EID1",
- "filing_name": "filing",
- "start_date": "2021-01-01",
- "end_date": "2021-12-31",
- "dimension_one_axis": "Total",
- "column_one": "value 1",
- "column_two": "value 2",
- },
- "cid_4": {
- "entity_id": "EID1",
- "filing_name": "filing",
- "start_date": "2020-01-01",
- "end_date": "2020-12-31",
- "dimension_one_axis": "Total",
- "column_one": "value 3",
- "column_two": "value 4",
- },
- "cid_5": {
- "entity_id": "EID1",
- "filing_name": "filing",
- "start_date": "2020-01-01",
- "end_date": "2020-12-31",
- "dimension_one_axis": "Dim 1 Value",
- "column_one": "value 9",
- "column_two": "value 10",
- },
- }
- ).T,
+ pd.read_csv(
+ io.StringIO(
+ "c_id,entity_id,filing_name,start_date,end_date,dimension_one_axis,column_one,column_two\n"
+ 'cid_1,EID1,filing,2021-01-01,2021-12-31,Total,"value 1","value 2"\n'
+ 'cid_4,EID1,filing,2020-01-01,2020-12-31,Total,"value 3","value 4"\n'
+ 'cid_5,EID1,filing,2020-01-01,2020-12-31,"Dim 1 Value","value 9","value 10"\n'
+ )
+ ),
),
(
_create_schema(axes=["dimension_one_axis", "dimension_two_axis"]),
"instant",
- pd.DataFrame(
- {
- "cid_2": {
- "entity_id": "EID1",
- "filing_name": "filing",
- "date": "2021-12-31",
- "dimension_one_axis": "Total",
- "dimension_two_axis": "Total",
- "column_one": "value 5",
- "column_two": "value 6",
- },
- "cid_3": {
- "entity_id": "EID1",
- "filing_name": "filing",
- "date": "2021-12-31",
- "dimension_one_axis": "Dim 1 Value",
- "dimension_two_axis": "ferc:Dimension2Value",
- "column_one": "value 7",
- "column_two": "value 8",
- },
- }
- ).T,
+ pd.read_csv(
+ io.StringIO(
+ "c_id,entity_id,filing_name,date,dimension_one_axis,dimension_two_axis,column_one,column_two\n"
+ 'cid_2,EID1,filing,2021-12-31,Total,Total,"value 5","value 6"\n'
+ 'cid_3,EID1,filing,2021-12-31,"Dim 1 Value","ferc:Dimension2Value","value 7","value 8"\n'
+ )
+ ),
),
],
)
@@ -152,4 +103,5 @@ def test_construct_dataframe(table_schema, period, df, in_memory_filing):
fact_table = FactTable(table_schema, period)
constructed_df = fact_table.construct_dataframe(instance)
- pd.testing.assert_frame_equal(df, constructed_df)
+ expected_df = df.set_index(table_schema.primary_key).drop("c_id", axis="columns")
+ pd.testing.assert_frame_equal(expected_df, constructed_df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment