Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save niconoe/b5405baa06e00d1695dfa64ebc6b385b to your computer and use it in GitHub Desktop.
Save niconoe/b5405baa06e00d1695dfa64ebc6b385b to your computer and use it in GitHub Desktop.
from dwca.read import DwCAReader
import os
import psutil
# Demo script: iterate over the core rows of a Darwin Core Archive while printing
# this process's memory use, and pull the interpreted + verbatim data of each record.

pid = os.getpid()
py = psutil.Process(pid)  # handle on the current process, used to query memory use

# Should have the same content as your data file (that I had trouble downloading for some reason)
DWCA_PATH = '0047730-200613084148143.zip'

# Row type searched for among each core row's extensions to locate the verbatim record.
VERBATIM_ROWTYPE = 'http://rs.tdwg.org/dwc/terms/Occurrence'

# memory_info().rss is in bytes; dividing by 2 ** 30 converts it for display.
print(f'memory use before: {(py.memory_info().rss / 2. ** 30):.2f} Gb')

# Skipping multimedia.txt (if you don't use it) will slightly improve performance.
# The reader is opened as a context manager so that it is always closed (see the
# python-dwca-reader documentation for what close() releases), even if the loop raises.
with DwCAReader(DWCA_PATH, extensions_to_ignore=["multimedia.txt"]) as dwca:
    for core_row in dwca:
        # flush=True so memory use can be followed live while the loop is running.
        print(f'memory use in loop: {(py.memory_info().rss / 2. ** 30):.2f} Gb', flush=True)

        # First iteration takes a long time (it's parsing all the extension files), but the
        # result is cached => following iterations are MUCH faster
        interpreted_data = core_row.data

        # Data of the first extension row with the verbatim row type, or None when there
        # is no verbatim data for this record.
        verbatim_data = next(
            (ext.data for ext in core_row.extensions if ext.rowtype == VERBATIM_ROWTYPE),
            None,
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment