-
-
Save EzraBrand/a64f9dcb21b881273ae04a5fc0b30781 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Build a concordance of Talmud toponyms from a full Talmud text and a
toponym gazetteer.

Requirements / Assumptions
--------------------------
- Input 1: talmud_full.txt
  A UTF-8 plain-text file with the full Talmud text in English, where
  tokenization by whitespace is acceptable.
- Input 2: talmud_toponyms_gazetteer.txt
  A UTF-8 plain-text file with one toponym per line (possibly multi-word).

Output
------
- talmud_toponyms_concordance.txt
  For each toponym that appears in the text:
      Toponym: <Name>
      Total occurrences: <count>
      1. <10 words before + toponym + 15 words after>
      ...
  Up to 5 contexts per toponym, sorted alphabetically by toponym.
"""
from pathlib import Path
from collections import defaultdict

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Input file paths (adjust as needed)
TALMUD_PATH = Path("talmud_full.txt")  # full Talmud text, UTF-8, whitespace-tokenizable
GAZETTEER_PATH = Path("talmud_toponyms_gazetteer.txt")  # one toponym per line
# Output file path
OUTPUT_PATH = Path("talmud_toponyms_concordance.txt")
# Number of context words around each match (10 before, 15 after per spec)
LEFT_CONTEXT_WORDS = 10
RIGHT_CONTEXT_WORDS = 15
# Punctuation characters to strip from token edges during normalization.
# Includes ASCII punctuation plus common typographic quotes/dashes/ellipsis.
PUNCT_CHARS = '.,;:!?()[]{}"“”‘’\'`<>/\\|+-=*_^~…—–'
# ---------------------------------------------------------------------------
# Normalization helpers
# ---------------------------------------------------------------------------
def norm_token(token: str) -> str:
    """
    Return *token* normalized for matching.

    Characters listed in PUNCT_CHARS are removed from both ends of the
    token, and the remainder is lowercased.
    """
    stripped = token.strip(PUNCT_CHARS)
    return stripped.lower()
def norm_word(word: str) -> str:
    """
    Normalize a single gazetteer word.

    This is identical to norm_token; it exists as a separate name purely to
    keep gazetteer-handling code self-documenting.
    """
    return norm_token(word)
| # --------------------------------------------------------------------------- | |
| # Loading input files | |
| # --------------------------------------------------------------------------- | |
def load_text(path: Path) -> str:
    """Return the full contents of *path* decoded as UTF-8."""
    with path.open(encoding="utf-8") as fh:
        return fh.read()
def load_gazetteer(path: Path):
    """
    Return the non-empty, whitespace-trimmed lines of the gazetteer file.

    Each returned string is one raw toponym (possibly multi-word).
    """
    entries = []
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        cleaned = raw_line.strip()
        if cleaned:
            entries.append(cleaned)
    return entries
| # --------------------------------------------------------------------------- | |
| # Preparing tokens and toponyms | |
| # --------------------------------------------------------------------------- | |
def tokenize_talmud(text: str):
    """
    Split *text* on whitespace.

    Returns a pair (tokens_raw, tokens_norm): the original tokens, used when
    printing context snippets, and their normalized forms, used for matching.
    The two lists are index-aligned.
    """
    tokens_raw = text.split()
    return tokens_raw, list(map(norm_token, tokens_raw))
def prepare_toponyms(toponyms):
    """
    Convert raw toponym strings into match-ready records.

    Each record is a dict with keys:
        "name":       the original string (used in output),
        "norm_words": its normalized words, with empty results dropped,
        "word_len":   number of normalized words,
        "char_len":   length of the normalized words joined by single spaces.

    Entries that normalize to nothing (e.g. pure punctuation) are skipped.
    """
    prepared = []
    for raw_name in toponyms:
        normalized = [norm_word(w) for w in raw_name.split()]
        normalized = [w for w in normalized if w]
        if not normalized:
            continue
        prepared.append({
            "name": raw_name,
            "norm_words": normalized,
            "word_len": len(normalized),
            "char_len": len(" ".join(normalized)),
        })
    return prepared
def build_first_token_index(toponym_data):
    """
    Index toponym records by their first normalized word.

    Returns a dict mapping first word -> list of records, where each list is
    sorted so the longest candidates come first:
        1. word_len descending,
        2. then char_len descending,
        3. then name ascending (stable alphabetical tie-break).

    Trying candidates in this order lets the scanner always prefer the
    longest possible match at a given text position.
    """
    index = defaultdict(list)
    for td in toponym_data:
        index[td["norm_words"][0]].append(td)
    # Only the candidate lists need sorting; the keys are irrelevant here.
    for candidates in index.values():
        candidates.sort(key=lambda td: (-td["word_len"], -td["char_len"], td["name"]))
    return index
| # --------------------------------------------------------------------------- | |
| # Matching toponyms in the Talmud | |
| # --------------------------------------------------------------------------- | |
def find_toponym_occurrences(tokens_norm, index_by_first, toponym_data):
    """
    Scan through normalized tokens and record non-overlapping occurrences of
    toponyms.

    Strategy:
      - Walk linearly through tokens_norm.
      - At position i, look up candidates whose first word matches
        tokens_norm[i].
      - For each candidate toponym (longest first, as ordered by
        build_first_token_index), check that all tokens in the span are
        unused and that the normalized span equals the toponym's words.
        If so, record an occurrence and mark those tokens as used, so no
        token is ever counted twice.

    Note: *toponym_data* is accepted for interface compatibility but is not
    consulted; every candidate record is reached through *index_by_first*.

    Returns:
        occurrences: dict name -> list of (start_index, word_len)
    """
    n = len(tokens_norm)
    used = [False] * n
    occurrences = defaultdict(list)
    for i, tok in enumerate(tokens_norm):
        if used[i] or not tok:
            continue
        # Single dict lookup instead of membership test + subscript.
        candidates = index_by_first.get(tok)
        if not candidates:
            continue
        for td in candidates:
            span = td["word_len"]
            if i + span > n:
                continue
            # Ensure no token in the span was claimed by an earlier match.
            if any(used[i + k] for k in range(span)):
                continue
            if tokens_norm[i:i + span] == td["norm_words"]:
                occurrences[td["name"]].append((i, span))
                for k in range(span):
                    used[i + k] = True
                break  # longest-first: do not try shorter toponyms here
    return occurrences
| # --------------------------------------------------------------------------- | |
| # Building context snippets | |
| # --------------------------------------------------------------------------- | |
def build_context(tokens_raw, start_idx, length,
                  left_words=LEFT_CONTEXT_WORDS,
                  right_words=RIGHT_CONTEXT_WORDS):
    """
    Return a space-joined snippet of raw tokens surrounding a match.

    The snippet holds at most *left_words* tokens before the match, the
    matched span itself, and at most *right_words* tokens after it; the
    window is clipped at both ends of the token list.
    """
    last_match_idx = start_idx + length - 1
    lo = max(0, start_idx - left_words)
    hi = min(len(tokens_raw) - 1, last_match_idx + right_words)
    return " ".join(tokens_raw[lo:hi + 1])
| # --------------------------------------------------------------------------- | |
| # Writing the concordance | |
| # --------------------------------------------------------------------------- | |
def write_concordance(output_path: Path, tokens_raw, occurrences):
    """
    Write the concordance to *output_path* as UTF-8 plain text.

    Toponyms are ordered case-insensitively by name.  Each entry lists the
    toponym, its total occurrence count, and up to five numbered context
    snippets in ascending text order, followed by a blank separator line.
    """
    out_lines = []
    for name in sorted(occurrences, key=str.lower):
        spans = sorted(occurrences[name], key=lambda span: span[0])
        out_lines.append(f"Toponym: {name}")
        out_lines.append(f"Total occurrences: {len(spans)}")
        for number, (start, span_len) in enumerate(spans[:5], start=1):
            out_lines.append(f"{number}. {build_context(tokens_raw, start, span_len)}")
        out_lines.append("")
    output_path.write_text("\n".join(out_lines), encoding="utf-8")
| # --------------------------------------------------------------------------- | |
| # Main | |
| # --------------------------------------------------------------------------- | |
def main():
    """Run the full pipeline: load inputs, match toponyms, write output."""
    # Load and tokenize the Talmud text.
    tokens_raw, tokens_norm = tokenize_talmud(load_text(TALMUD_PATH))
    # Prepare the gazetteer records and the longest-first lookup index.
    toponym_data = prepare_toponyms(load_gazetteer(GAZETTEER_PATH))
    index_by_first = build_first_token_index(toponym_data)
    # Match toponyms in the text and write the concordance file.
    occurrences = find_toponym_occurrences(tokens_norm, index_by_first, toponym_data)
    write_concordance(OUTPUT_PATH, tokens_raw, occurrences)


if __name__ == "__main__":
    main()
Script created and run by ChatGPT (GPT-5); conversation:
https://chatgpt.com/share/693be079-4f28-8012-8c54-7954a658b5a9
original prompt:
""
attached are two text files. I want you to create a concordance of talmud toponyms in the talmud. concordance should output up to 5 results for each toponym. each result should have 10 words before, and 15 words after the toponym. also, count the number of times each toponym appears in the talmud, and that count should be noted for each entry, before appearances. when iterating through, sort longest to shortest, so as not to double count. but in output, sort alphabetically. use tokenization, if more efficient. output in text file
""
updated prompt (for bible names):
"""
attached are two text files. I want you to create a concordance of bible names in the talmud. concordance should output up to 5 results for each entry. each result should have 10 words before, and 20 words after the entry. also, count the number of times each entry appears in the talmud, and that count should be noted for each entry, before appearances. when iterating through, sort longest to shortest, so as not to double count. but in output, sort alphabetically. use tokenization, if more efficient. output in text file.
final output should be two new files:
- concordance text file
- csv file with two cols: entry, and count
"""
Output file (formatted) here:
https://www.academia.edu/145382795/Concordance_of_Talmudic_Toponyms