Skip to content

Instantly share code, notes, and snippets.

@pybites
Created March 8, 2023 08:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pybites/9406dd696a11e494dc300351dfbc0873 to your computer and use it in GitHub Desktop.
Save pybites/9406dd696a11e494dc300351dfbc0873 to your computer and use it in GitHub Desktop.
from __future__ import annotations
import argparse
import csv
import functools
import pathlib
import xml.etree.ElementTree
from collections.abc import Callable, Collection, Generator, Iterator, Mapping
from typing import TYPE_CHECKING, ParamSpec, Protocol, TypeVar
if TYPE_CHECKING:
from _typeshed import StrPath, SupportsRead
P = ParamSpec("P")
T_co = TypeVar("T_co", covariant=True)
class SupportsNext(Protocol[T_co]):
def __next__(self) -> T_co:
...
SupportsNextT = TypeVar("SupportsNextT", bound=SupportsNext)
def consumer(func: Callable[P, SupportsNextT], /) -> Callable[P, SupportsNextT]:
@functools.wraps(func)
def wrapper(*args: P.args, **kwargs: P.kwargs) -> SupportsNextT:
gen = func(*args, **kwargs)
next(gen)
return gen
return wrapper
def xml_reader(source: StrPath | SupportsRead[bytes]) -> Iterator[dict[str, str]]:
    """Stream the attributes of every ``<row>`` element found in *source*.

    The document is parsed incrementally. Each time the start tag of an
    element named ``row`` is seen, that element's attribute dictionary is
    yielded; elements with any other tag are skipped. Rows are yielded in
    the order their start tags appear in the document.

    Args:
        source: A filename or a binary file object containing XML.

    Yields:
        The attribute mapping (name -> value) of each ``row`` element.

    Raises:
        xml.etree.ElementTree.ParseError: If *source* is not well-formed XML.
    """
    # Elements whose start tag has been seen but whose end tag has not,
    # outermost first. Needed to find the parent of an element when it ends.
    open_elements: list[xml.etree.ElementTree.Element] = []
    events = xml.etree.ElementTree.iterparse(source, events=("start", "end"))
    for event, elem in events:
        if event == "start":
            open_elements.append(elem)
            # Attributes are already complete at the "start" event.
            if elem.tag == "row":
                yield elem.attrib
        else:
            open_elements.pop()
            # ``iterparse`` still builds the whole tree behind the scenes.
            # Detach each finished element from its parent so memory use
            # stays bounded on very large inputs instead of growing with
            # the number of rows. The yielded attribute dict is untouched.
            if open_elements:
                open_elements[-1].remove(elem)
@consumer
def csv_writer(
    path: StrPath,
    fields: Collection[str],
    dialect: str | csv.Dialect | type[csv.Dialect],
) -> Generator[None, Mapping[str, str], None]:
    """Create a push-style CSV sink: ``send()`` a mapping to append one row.

    The file at *path* is created (or truncated) and a header line made of
    *fields* is written straight away. After that, each mapping passed to
    ``send()`` is written as one row; keys that are not listed in *fields*
    are ignored. Because of the ``@consumer`` decorator the generator is
    returned already advanced to its first ``yield``, so callers can call
    ``send()`` immediately.

    The file stays open until the generator is closed (``close()``) or
    garbage-collected, at which point the ``with`` block closes it.

    Args:
        path: Destination file.
        fields: Column names, in output order.
        dialect: CSV dialect name, instance or class passed to the writer.

    Yields:
        ``None``; the generator only receives values.
    """
    # ``newline=""`` lets the csv module control line endings. The encoding
    # is pinned to UTF-8 so output does not depend on the platform locale
    # and arbitrary Unicode text can always be written.
    with open(path, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fields, extrasaction="ignore", dialect=dialect)
        writer.writeheader()
        while True:
            row = yield
            writer.writerow(row)
# Command-line interface, built once at import time and used by ``main``.
parser = argparse.ArgumentParser()

# Optional positional input, opened in binary mode by argparse itself.
# The default "-" is argparse's spelling for standard input.
parser.add_argument(
    "file",
    nargs="?",
    default="-",
    type=argparse.FileType(mode="rb"),
)

# Directory that receives the generated CSV files (current directory by default).
parser.add_argument(
    "-o",
    "--output",
    default=".",
    type=pathlib.Path,
)

# Name of the csv dialect used for the output files.
parser.add_argument(
    "-d",
    "--dialect",
    default="excel",
)
def main(*argv: str) -> None:
args = parser.parse_args(argv or None)
writers = {
"1": csv_writer(
args.output / "Questions.csv",
["Id", "AcceptedAnswerId"],
dialect=args.dialect,
),
"2": csv_writer(
args.output / "Answers.csv",
["Id", "ParentId"],
dialect=args.dialect,
),
}
for row in xml_reader(args.file):
match row:
case {"PostTypeId": post_type_id, **fields}:
if writer := writers.get(post_type_id):
writer.send(fields)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment