@ryderwishart
Created July 8, 2024 20:32
Statistical gloss predictions - Markov Chain Monte Carlo
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Markov Chain Monte Carlo Glossing Tool\n",
"\n",
"This approach is useful for predicting glossing pairs between tokens when you have samples of each side of the equation in your training data. It's basically a way to predict finer-grained alignments from broad sentence alignments.\n",
"\n",
"For predicting translations, we should use the sliding window technique, since that one can gracefully handle out-of-training words."
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training on 31055 sentence pairs\n",
"Calculated stop words: {'le', 'de', 'vous', 'est', 'your', 'of', 'éternel', 'un', 'he', 'ne', 'a', 'd', 'for', 'i', 'que', 'you', 'dans', 'les', 'that', 'il', 'they', 'la', 'on', 'him', 'des', 's', 'the', 'je', 'à', 'to', 'and', 'qui', 'l', 'et', 'is', 'pour', 'en', 'with', 'will', 'his', 'yahweh', 'who', 'in'}\n",
"Stop words after training: {'le', 'de', 'vous', 'est', 'your', 'of', 'éternel', 'un', 'he', 'ne', 'a', 'd', 'for', 'i', 'que', 'you', 'dans', 'les', 'that', 'il', 'they', 'la', 'on', 'him', 'des', 's', 'the', 'je', 'à', 'to', 'and', 'qui', 'l', 'et', 'is', 'pour', 'en', 'with', 'will', 'his', 'yahweh', 'who', 'in'}\n",
"Number of stop words: 43\n",
"Top 10 stop words: ['le', 'de', 'vous', 'est', 'your', 'of', 'éternel', 'un', 'he', 'ne']\n",
"EASY\n",
"\n",
"Source: God created the heaven and the earth.\n",
"Target: Dieu créa les cieux et la planète.\n",
"\n",
"Glosses:\n",
"god:\n",
" - dieu: 1.000000\n",
"created:\n",
" - créa: 1.000000\n",
"heaven:\n",
" - cieux: 0.082352\n",
" - dieu: 0.010777\n",
"earth:\n",
" - planète: 1.000000\n",
"HARD\n",
"\n",
"Source: The Lord is my shepherd, I shall not want.\n",
"Target: Le Seigneur est mon berger, je ne veux pas lui.\n",
"\n",
"Glosses:\n",
"lord:\n",
" - seigneur: 0.219909\n",
" - mon: 0.050400\n",
" - pas: 0.022199\n",
"my:\n",
" - mon: 0.481298\n",
" - seigneur: 0.071360\n",
" - pas: 0.070998\n",
"shepherd:\n",
" - berger: 0.024712\n",
" - seigneur: 0.017098\n",
" - lui: 0.004475\n",
"shall:\n",
" - pas: 0.055236\n",
" - lui: 0.044041\n",
" - mon: 0.027454\n",
"not:\n",
" - pas: 0.133876\n",
" - lui: 0.066574\n",
" - mon: 0.031476\n",
"want:\n",
" - lui: 0.008095\n",
" - pas: 0.005516\n",
" - seigneur: 0.001859\n",
"\n",
"Predicting glosses for source sentence:\n",
"god: dieu\n",
"created: créa\n",
"heaven: refusas, fournis, manchot\n",
"earth: terre, planète\n",
"\n",
"Predicting glosses for target sentence:\n",
"dieu: god\n",
"créa: created\n",
"cieux: heavens\n",
"terre: earth\n",
"\n",
"Generating wooden translations:\n",
"Source to Target: dieu créa refusas terre\n",
"Target to Source: god created heavens earth\n"
]
}
],
"source": [
"from collections import defaultdict, Counter\n",
"import re\n",
"import requests\n",
"import os\n",
"import math\n",
"import numpy as np\n",
"from typing import List, Tuple, Dict, Set\n",
"\n",
"def download_corpus(url, filename):\n",
" if not os.path.exists(filename):\n",
" print(f\"Downloading {filename}...\")\n",
" response = requests.get(url)\n",
" with open(filename, 'w', encoding='utf-8') as f:\n",
" f.write(response.text)\n",
" \n",
" with open(filename, 'r', encoding='utf-8') as f:\n",
" return f.read().split('\\n')\n",
"\n",
"class ImprovedStatisticalGlosser:\n",
" def __init__(self):\n",
" self.co_occurrences = defaultdict(lambda: defaultdict(int))\n",
" self.source_counts = defaultdict(int)\n",
" self.target_counts = defaultdict(int)\n",
" self.source_doc_freq = defaultdict(int)\n",
" self.target_doc_freq = defaultdict(int)\n",
" self.total_docs = 0\n",
" self.stop_words: Set[str] = set()\n",
" self.known_glosses: Dict[str, List[str]] = {}\n",
"\n",
" def train(self, source_sentences: List[str], target_sentences: List[str]):\n",
" # Calculate stop words before training\n",
" self.calculate_stop_words(source_sentences + target_sentences)\n",
"\n",
" self.total_docs = len(source_sentences)\n",
" for source, target in zip(source_sentences, target_sentences):\n",
" source_tokens = self.tokenize(source)\n",
" target_tokens = self.tokenize(target)\n",
" \n",
" # if 'heavens' in source_tokens and 'cieux' in target_tokens:\n",
" # print(f\"Found 'heavens' and 'cieux' in: {source} | {target}\")\n",
" \n",
" source_set = set(source_tokens)\n",
" target_set = set(target_tokens)\n",
" \n",
" for s_token in source_tokens:\n",
" for t_token in target_tokens:\n",
" self.co_occurrences[s_token][t_token] += 1\n",
" self.source_counts[s_token] += 1\n",
" \n",
" for t_token in target_tokens:\n",
" self.target_counts[t_token] += 1\n",
" \n",
" for s_token in source_set:\n",
" self.source_doc_freq[s_token] += 1\n",
" for t_token in target_set:\n",
" self.target_doc_freq[t_token] += 1\n",
"\n",
" # print(f\"Co-occurrence count for 'heavens' and 'cieux': {self.co_occurrences['heavens']['cieux']}\")\n",
" # print(f\"Source count for 'heavens': {self.source_counts['heavens']}\")\n",
" # print(f\"Target count for 'cieux': {self.target_counts['cieux']}\")\n",
"\n",
" def calculate_stop_words(self, sentences: List[str], max_stop_words: int = 75):\n",
" word_counts = Counter(word for sentence in sentences for word in self.tokenize_raw(sentence))\n",
" total_words = sum(word_counts.values())\n",
" total_sentences = len(sentences)\n",
" \n",
" # Consider words appearing in more than 10% of sentences as stop words\n",
" stop_words = {word for word, count in word_counts.items() if count / total_sentences > 0.1}\n",
" \n",
" # Ensure we don't exceed max_stop_words\n",
" sorted_stop_words = sorted(stop_words, key=word_counts.get, reverse=True)\n",
" self.stop_words = set(sorted_stop_words[:max_stop_words])\n",
" print(f\"Calculated stop words: {self.stop_words}\") # Debugging line\n",
"\n",
" def add_known_glosses(self, known_glosses: Dict[str, List[str]]):\n",
" self.known_glosses = known_glosses\n",
"\n",
" def gloss(self, source_sentence, target_sentence):\n",
" source_tokens = self.tokenize(source_sentence)\n",
" target_tokens = self.tokenize(target_sentence)\n",
" \n",
" mappings = []\n",
" \n",
" for i, s_token in enumerate(source_tokens):\n",
" # if s_token == 'heavens':\n",
" # print(f\"Processing 'heavens' in gloss method\")\n",
" # print(f\"Known glosses for 'heavens': {self.known_glosses.get('heavens', [])}\")\n",
" \n",
" token_mappings = []\n",
" \n",
" # Check if there are known glosses for this token\n",
" if s_token in self.known_glosses:\n",
" known_targets = self.known_glosses[s_token]\n",
" for t_token in known_targets:\n",
" if t_token in target_tokens:\n",
" score = 1.0 # Assign a high score to known glosses\n",
" token_mappings.append((t_token, score))\n",
" \n",
" # If no known glosses were found, proceed with statistical glossing\n",
" if not token_mappings:\n",
" for j, t_token in enumerate(target_tokens):\n",
" score = self.calculate_score(s_token, t_token, i, j, len(source_tokens), len(target_tokens))\n",
" if score > 0:\n",
" token_mappings.append((t_token, score))\n",
" \n",
" token_mappings.sort(key=lambda x: x[1], reverse=True)\n",
" mappings.append((s_token, token_mappings))\n",
" \n",
" return mappings\n",
"\n",
" def calculate_score(self, source_token, target_token, source_pos, target_pos, source_len, target_len):\n",
" # if source_token == 'heavens' and target_token == 'cieux':\n",
" # print(f\"Calculating score for 'heavens' and 'cieux'\")\n",
" # print(f\"Co-occurrence: {self.co_occurrences[source_token][target_token]}\")\n",
" # print(f\"Source count: {self.source_counts[source_token]}\")\n",
" # print(f\"Target count: {self.target_counts[target_token]}\")\n",
" # print(f\"Source IDF: {math.log(self.total_docs / (self.source_doc_freq[source_token] + 1))}\")\n",
" # print(f\"Target IDF: {math.log(self.total_docs / (self.target_doc_freq[target_token] + 1))}\")\n",
" \n",
" co_occur = self.co_occurrences[source_token][target_token]\n",
" if co_occur == 0:\n",
" return 0\n",
" \n",
" source_count = self.source_counts[source_token]\n",
" target_count = self.target_counts[target_token]\n",
" \n",
" source_idf = math.log(self.total_docs / (self.source_doc_freq[source_token] + 1))\n",
" target_idf = math.log(self.total_docs / (self.target_doc_freq[target_token] + 1))\n",
" \n",
" tfidf_score = (co_occur / source_count) * source_idf * (co_occur / target_count) * target_idf\n",
" \n",
" position_score = 1 - abs((source_pos / source_len) - (target_pos / target_len))\n",
" \n",
" return tfidf_score * position_score\n",
"\n",
" def tokenize(self, sentence: str) -> List[str]:\n",
" tokens = re.findall(r'\\w+', sentence.lower())\n",
" filtered_tokens = [token for token in tokens if token not in self.stop_words]\n",
" # print(f\"Original tokens: {tokens}\") # Debugging line\n",
" # print(f\"Filtered tokens: {filtered_tokens}\") # Debugging line\n",
" return filtered_tokens\n",
"\n",
" @staticmethod\n",
" def tokenize_raw(sentence: str) -> List[str]:\n",
" return re.findall(r'\\w+', sentence.lower())\n",
"\n",
" def get_target_vocabulary(self) -> List[str]:\n",
" \"\"\"Return a list of all unique words in the target corpus.\"\"\"\n",
" return list(self.target_counts.keys())\n",
"\n",
" def predict_gloss_for_word(self, target_word: str, top_n: int = 3) -> List[Tuple[str, float]]:\n",
" \"\"\"Predict glosses for a single target word.\"\"\"\n",
" glosses = []\n",
" for source_word in self.source_counts.keys():\n",
" score = self.calculate_score(source_word, target_word, 0, 0, 1, 1)\n",
" if score > 0:\n",
" glosses.append((source_word, score))\n",
" \n",
" return sorted(glosses, key=lambda x: x[1], reverse=True)[:top_n]\n",
"\n",
" def predict_glosses_for_vocabulary(self, batch_size: int = 100) -> Dict[str, List[Tuple[str, float]]]:\n",
" \"\"\"Predict glosses for all words in the target vocabulary.\"\"\"\n",
" target_vocab = self.get_target_vocabulary()\n",
" all_glosses = {}\n",
"\n",
" for i in range(0, len(target_vocab), batch_size):\n",
" batch = target_vocab[i:i+batch_size]\n",
" for word in batch:\n",
" all_glosses[word] = self.predict_gloss_for_word(word)\n",
" \n",
" # Progress update\n",
" print(f\"Processed {min(i+batch_size, len(target_vocab))}/{len(target_vocab)} words\")\n",
"\n",
" return all_glosses\n",
"\n",
" def predict_sentence_glosses(self, sentence: str, is_source: bool = True, top_n: int = 1) -> List[List[str]]:\n",
" \"\"\"\n",
" Predict the most likely glosses for each word in the input sentence.\n",
" \n",
" Args:\n",
" sentence (str): The input sentence to gloss.\n",
" is_source (bool): If True, treat the input as a source language sentence.\n",
" If False, treat it as a target language sentence.\n",
" top_n (int): Number of top glosses to return for each word.\n",
" \n",
" Returns:\n",
" List[List[str]]: A list of lists, where each inner list contains the top_n\n",
" glosses for the corresponding word in the input sentence.\n",
" \"\"\"\n",
" tokens = self.tokenize(sentence)\n",
" sentence_glosses = []\n",
"\n",
" for token in tokens:\n",
" if is_source:\n",
" glosses = self.predict_gloss_for_source_word(token, top_n)\n",
" else:\n",
" glosses = self.predict_gloss_for_target_word(token, top_n)\n",
" sentence_glosses.append([gloss for gloss, _ in glosses])\n",
"\n",
" return sentence_glosses\n",
"\n",
" def predict_gloss_for_source_word(self, source_word: str, top_n: int = 1) -> List[Tuple[str, float]]:\n",
" if source_word in self.stop_words:\n",
" return []\n",
" \n",
" # Check known glosses first\n",
" if source_word in self.known_glosses:\n",
" return [(gloss, 1.0) for gloss in self.known_glosses[source_word]][:top_n]\n",
" \n",
" glosses = []\n",
" for target_word in self.target_counts.keys():\n",
" if target_word not in self.stop_words:\n",
" score = self.calculate_score(source_word, target_word, 0, 0, 1, 1)\n",
" if score > 0:\n",
" glosses.append((target_word, score))\n",
" return sorted(glosses, key=lambda x: x[1], reverse=True)[:top_n]\n",
"\n",
" def predict_gloss_for_target_word(self, target_word: str, top_n: int = 1) -> List[Tuple[str, float]]:\n",
" if target_word in self.stop_words:\n",
" return []\n",
" \n",
" # Check known glosses first\n",
" for source_word, target_words in self.known_glosses.items():\n",
" if target_word in target_words:\n",
" return [(source_word, 1.0)][:top_n]\n",
" \n",
" glosses = []\n",
" for source_word in self.source_counts.keys():\n",
" if source_word not in self.stop_words:\n",
" score = self.calculate_score(source_word, target_word, 0, 0, 1, 1)\n",
" if score > 0:\n",
" glosses.append((source_word, score))\n",
" return sorted(glosses, key=lambda x: x[1], reverse=True)[:top_n]\n",
"\n",
" def generate_wooden_translation(self, sentence: str, is_source: bool = True) -> str:\n",
" \"\"\"\n",
" Generate a wooden back-translation for the input sentence.\n",
" \n",
" Args:\n",
" sentence (str): The input sentence to translate.\n",
" is_source (bool): If True, translate from source to target language.\n",
" If False, translate from target to source language.\n",
" \n",
" Returns:\n",
" str: The wooden back-translation of the input sentence.\n",
" \"\"\"\n",
" tokens = self.tokenize(sentence)\n",
" glosses = self.predict_sentence_glosses(' '.join(tokens), is_source)\n",
" translated_words = [gloss_list[0] if gloss_list else '[UNK]' for gloss_list in glosses]\n",
" return ' '.join(translated_words)\n",
"\n",
"# Test the implementation\n",
"if __name__ == \"__main__\":\n",
" # Download and preprocess the corpora\n",
" french_url = \"https://github.com/BibleNLP/ebible/raw/main/corpus/fra-fraLSG.txt\"\n",
" english_url = \"https://github.com/BibleNLP/ebible/raw/main/corpus/eng-eng-web.txt\"\n",
"\n",
" french_corpus = download_corpus(french_url, \"french_corpus.txt\")\n",
" english_corpus = download_corpus(english_url, \"english_corpus.txt\")\n",
"\n",
" # Remove empty lines and ensure corpora are aligned\n",
" french_corpus = [line for line in french_corpus if line.strip()]\n",
" english_corpus = [line for line in english_corpus if line.strip()]\n",
" min_length = min(len(french_corpus), len(english_corpus))\n",
" french_corpus = french_corpus[:min_length]\n",
" english_corpus = english_corpus[:min_length]\n",
"\n",
" print(f\"Training on {len(french_corpus)} sentence pairs\")\n",
"\n",
" # Create and train the glosser\n",
" glosser = ImprovedStatisticalGlosser()\n",
" glosser.train(english_corpus, french_corpus)\n",
"\n",
" print(f\"Stop words after training: {glosser.stop_words}\")\n",
"\n",
" print(f\"Number of stop words: {len(glosser.stop_words)}\")\n",
" print(f\"Top 10 stop words: {list(glosser.stop_words)[:10]}\")\n",
"\n",
" # Add known glosses\n",
" known_glosses = {\n",
" \"god\": [\"dieu\"],\n",
" \"created\": [\"créa\"],\n",
" \"heavens\": [\"cieux\"],\n",
" \"earth\": [\"terre\", \"planète\"],\n",
" }\n",
" glosser.add_known_glosses(known_glosses)\n",
"\n",
" # Test sentence pair (easier)\n",
" test_source = \"God created the heavens and the earth.\"\n",
" test_target = \"Dieu créa les cieux et la planète.\"\n",
" \n",
" # Test sentence pair (harder)\n",
" test_source_hard = \"The Lord is my shepherd, I shall not want.\"\n",
" test_target_hard = \"Le Seigneur est mon berger, je ne veux pas lui.\"\n",
"\n",
" # Get glosses\n",
" glosses = glosser.gloss(test_source, test_target)\n",
" glosses_hard = glosser.gloss(test_source_hard, test_target_hard)\n",
"\n",
" # Print results\n",
" print(\"EASY\")\n",
" print(f\"\\nSource: {test_source}\")\n",
" print(f\"Target: {test_target}\")\n",
" print(\"\\nGlosses:\")\n",
" for source_token, mappings in glosses:\n",
" print(f\"{source_token}:\")\n",
" for target_token, score in mappings[:3]: # Limit to top 3 mappings\n",
" print(f\" - {target_token}: {score:.6f}\")\n",
"\n",
" print(\"HARD\")\n",
" print(f\"\\nSource: {test_source_hard}\")\n",
" print(f\"Target: {test_target_hard}\")\n",
" print(\"\\nGlosses:\")\n",
" for source_token, mappings in glosses_hard:\n",
" print(f\"{source_token}:\")\n",
" for target_token, score in mappings[:3]: # Limit to top 3 mappings\n",
" print(f\" - {target_token}: {score:.6f}\")\n",
"\n",
" # Test the new methods\n",
" source_sentence = \"God created the heavens and the earth.\"\n",
" target_sentence = \"Dieu créa les cieux et la terre.\"\n",
"\n",
" print(\"\\nPredicting glosses for source sentence:\")\n",
" source_glosses = glosser.predict_sentence_glosses(source_sentence, is_source=True, top_n=3)\n",
" for word, glosses in zip(glosser.tokenize(source_sentence), source_glosses):\n",
" print(f\"{word}: {', '.join(glosses)}\")\n",
"\n",
" print(\"\\nPredicting glosses for target sentence:\")\n",
" target_glosses = glosser.predict_sentence_glosses(target_sentence, is_source=False, top_n=3)\n",
" for word, glosses in zip(glosser.tokenize(target_sentence), target_glosses):\n",
" print(f\"{word}: {', '.join(glosses)}\")\n",
"\n",
" print(\"\\nGenerating wooden translations:\")\n",
" print(f\"Source to Target: {glosser.generate_wooden_translation(source_sentence, is_source=True)}\")\n",
" print(f\"Target to Source: {glosser.generate_wooden_translation(target_sentence, is_source=False)}\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}