Skip to content

Instantly share code, notes, and snippets.

@rshewitt
Last active August 22, 2024 02:26
Show Gist options
  • Save rshewitt/13a37d46f1fd0f77b80ca94f806f204c to your computer and use it in GitHub Desktop.
Save rshewitt/13a37d46f1fd0f77b80ca94f806f204c to your computer and use it in GitHub Desktop.
datagov catalog xml schema document counter
import requests
import re
import xml.etree.ElementTree as ET
def main():
    """Tally FGDC XML documents listed in ``result.2.txt``.

    The first nine lines of the input file are skipped; the remaining
    lines are consumed in pairs. The first line of a pair supplies the
    dataset count (the second run of digits on that line) and the second
    line describes the document and contains its URL. Pairs whose second
    line does not mention "FGDC" are ignored.

    Each remaining document is downloaded and parsed as XML. Its root tag
    is tallied against the dataset count, and the FGDC standard version
    reported by ``identify_fgdc_version`` (when present) is collected.

    Prints the set of distinct FGDC versions followed by the number of
    FGDC documents examined. Returns None.
    """
    with open("result.2.txt") as f:
        text_data = [line.strip() for line in f.readlines()[9:]]

    # Raw string: "\S" in a normal string literal is an invalid escape.
    url_pattern = re.compile(r"http[s]*\S+")
    # Seconds; without a timeout one stalled server hangs the whole run.
    request_timeout = 30

    output = {}  # root tag -> total number of datasets
    bad = []  # line indices of documents that could not be processed
    fgdc_versions = set()
    c = 0  # number of FGDC documents examined

    # Stop at len - 1 so a trailing unpaired line is ignored rather than
    # raising IndexError on text_data[i + 1].
    for i in range(0, len(text_data) - 1, 2):
        doc = text_data[i + 1]
        # Filter first so non-FGDC pairs are never parsed at all.
        if "FGDC" not in doc:
            continue
        c += 1

        num_datasets = int(re.findall(r"\d+", text_data[i])[1])

        url_match = url_pattern.search(doc)
        if url_match is None:
            bad.append(i)
            continue

        try:
            resp = requests.get(url_match.group(0), timeout=request_timeout)
            # Treat HTTP error pages as failures instead of parsing them.
            resp.raise_for_status()
            root = ET.fromstring(resp.content)
        except Exception:
            # Best-effort crawl: record the failure and move on, but let
            # KeyboardInterrupt / SystemExit propagate (a bare ``except``
            # would swallow them).
            bad.append(i)
            continue

        fgdc_version = identify_fgdc_version(root)
        if fgdc_version is not None:
            fgdc_versions.add(fgdc_version)

        # Key by the tag string: every parsed Element is a distinct object,
        # so keying by the Element itself would never aggregate counts.
        root_tag = root.tag
        output[root_tag] = output.get(root_tag, 0) + num_datasets

    # for k, v in output.items():
    #     print(k, v)
    print(fgdc_versions)
    print(c)
    # print(bad)
def identify_fgdc_version(root):
    """Return the text of the document's ``metstdv`` element, if any.

    Searches for the first ``metstdv`` element found at any depth beneath
    a ``metainfo`` child of ``root``.

    Args:
        root: Root ``xml.etree.ElementTree.Element`` of a parsed document.

    Returns:
        The matching element's text (None when that element is empty),
        or None when no such element exists.
    """
    version_node = root.find("./metainfo//metstdv")
    if version_node is None:
        return None
    return version_node.text
# Run the tally only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment