Skip to content

Instantly share code, notes, and snippets.

@kylebgorman
Last active September 24, 2019 21:17
Show Gist options
  • Save kylebgorman/111aa961b9b454bded1a82de056fae1a to your computer and use it in GitHub Desktop.
Save kylebgorman/111aa961b9b454bded1a82de056fae1a to your computer and use it in GitHub Desktop.
Splits Gigaword SGML documents into separate files
#!/usr/bin/env python
"""Extracts documents from the Gigaword SGML."""
import argparse
import logging
import os
import bs4
def main(args: argparse.Namespace) -> None:
    """Splits each input SGML file into one plain-text file per document.

    Every <text> element found in an input file is written to its own file
    inside `args.dir`, named "<input basename>_<5-digit index>.txt". Each <p>
    element of the document becomes one output line, with internal newlines
    replaced by spaces.

    Args:
        args: parsed command-line arguments; `args.sgml` is the list of input
            paths and `args.dir` is the output directory.

    Raises:
        AssertionError: if a <p> element does not have exactly one child.
    """
    # Create the output directory up front so the first write cannot fail
    # merely because the directory is missing.
    if args.dir:
        os.makedirs(args.dir, exist_ok=True)
    for sgml in args.sgml:
        logging.info("Input file: %s", sgml)
        # Explicit encoding keeps behavior independent of the platform locale.
        # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
        with open(sgml, "r", encoding="utf-8") as source:
            soup = bs4.BeautifulSoup(source, "lxml")
        texts = soup.find_all("text")
        # Only the basename goes into the output name: joining the full input
        # path would escape `args.dir` for absolute paths and would point at
        # non-existent subdirectories for relative ones.
        stem = os.path.basename(sgml)
        for i, text in enumerate(texts):
            sink_path = os.path.join(args.dir, f"{stem}_{i:05d}.txt")
            with open(sink_path, "w", encoding="utf-8") as sink:
                for p in text.find_all("p"):
                    contents = p.contents
                    # Explicit raise (not `assert`) so the check survives -O.
                    if len(contents) != 1:
                        raise AssertionError(
                            f"Expected 1, got {len(p.contents)}"
                        )
                    paragraph = contents[0].strip()
                    paragraph = paragraph.replace("\n", " ")
                    print(paragraph, file=sink)
        # Count from the result list rather than the loop variable, which is
        # unbound (or stale from a previous file) when no <text> was found.
        logging.info("%d documents written", len(texts))
if __name__ == "__main__":
    # Script entry point: describe the command line, enable INFO-level
    # logging, then pass the parsed arguments to main().
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--dir", required=True, help="output directory")
    cli.add_argument("sgml", nargs="+")
    logging.basicConfig(format="%(levelname)s: %(message)s", level="INFO")
    namespace = cli.parse_args()
    main(namespace)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment