Skip to content

Instantly share code, notes, and snippets.

@andjc
Created May 22, 2024 07:05
Show Gist options
  • Save andjc/156c702685b10de7921de67f3f81c0a0 to your computer and use it in GitHub Desktop.
Save andjc/156c702685b10de7921de67f3f81c0a0 to your computer and use it in GitHub Desktop.
MARC-8 and EACC
@andjc
Copy link
Author

andjc commented May 22, 2024

To get lists of MARC-8 and EACC characters:

import requests
import pandas as pd

# Location of the code-table XML that both character lists are built from.
url = "https://www.loc.gov/marc/specifications/codetables.xml"
# A browser-like User-Agent is sent with the request
# (presumably the server rejects the default client string -- verify).
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
# Bound the wait so a stalled connection cannot hang the script forever.
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of handing an error page to the XML parser.
response.raise_for_status()
xml_contents = response.content

# Tables to generate; use ['marc8', 'eacc'] to build both lists.
files = ['eacc']

def generate_data(generate_files, xml_source=None):
    """Build lists of the Unicode characters listed in the MARC-8 / EACC tables.

    Args:
        generate_files: Collection of table names to generate. The entries
            that are recognised are ``'marc8'`` and ``'eacc'``.
        xml_source: Optional code-table XML. Raw bytes are wrapped in a
            buffer; anything else is handed to ``pandas.read_xml`` unchanged.
            When omitted, the module-level ``xml_contents`` is used.

    Returns:
        A ``(marc8_chars, eacc_chars)`` tuple. Each element is a list of
        single-character strings (built from a set, so in no particular
        order), or ``None`` when that table was not requested.
    """
    import io

    def read_table(xpath):
        # Resolved lazily so nothing is looked up when no table is requested.
        raw = xml_contents if xml_source is None else xml_source
        # pandas has deprecated literal XML input to read_xml and asks for
        # it to be wrapped in a buffer instead.
        if isinstance(raw, (bytes, bytearray)):
            raw = io.BytesIO(raw)
        return pd.read_xml(raw, xpath=xpath)

    def to_chars(hex_codes):
        # Each entry is a hexadecimal code point such as '3013'.
        return [chr(int(code, 16)) for code in hex_codes]

    marc8_chars = None
    eacc_chars = None

    if 'marc8' in generate_files:
        print("Generating MARC-8 character list")
        marc8 = read_table('/codeTables/codeTable/characterSet/*')
        # Hard-coded row labels excluded from the list
        # (presumably rows that are not usable characters -- verify against the XML).
        marc8 = marc8.drop([663, 664, 665, 666, 667], axis=0)
        marc8 = marc8.dropna(subset=['ucs'])
        marc8_set = set(marc8['ucs'])
        # Extra code points added by hand: the double inverted breve and
        # double tilde, plus their left/right half forms.
        marc8_set.update(('0361', 'FE20', 'FE21', '0360', 'FE22', 'FE23'))
        marc8_chars = to_chars(marc8_set)

    if 'eacc' in generate_files:
        print("Generating EACC character list")
        eacc = read_table(
            '/codeTables/codeTable[@name="East Asian"]/characterSet/grouping/*'
        )
        # The first matched row is discarded before collecting code points.
        eacc = eacc.drop(index=eacc.index[0])
        eacc = eacc.dropna(subset=['ucs'])
        eacc_set = set(eacc['ucs'])
        # Must be a one-element tuple: a bare ('3013') is just the string
        # '3013', and set.update() would add its characters one by one.
        eacc_set.update(('3013',))
        eacc_chars = to_chars(eacc_set)

    return (marc8_chars, eacc_chars)

# Run the generator for the configured tables; a table that was not requested comes back as None.
marc8, eacc = generate_data(generate_files=files)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment