ryderwishart/mp-scoring.ipynb

## mp-scoring.ipynb
{
    "cells": [
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "# Monolingual Likelihood and Bi-Lingual Likelihood Scoring\n",
                "\n",
                "For this scoring metric, we need \n",
                "\n",
                "- [x] a monolingual, unstructured corpus\n",
                "- [x] a bilingual, translation-pair corpus\n",
                "- [x] a hypothesis translation pair not in the bilingual corpus\n",
                "- [x] a tokenization function\n",
                "- a way to calculate the likelihood of a sequence in the monolingual corpus\n",
                "- a way to calculate the likelihood of a sequence-pair in the bilingual corpus\n"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 30,
            "metadata": {},
            "outputs": [],
            "source": [
                "\n",
                "# Imports\n",
                "\n",
                "import math \n",
                "import random\n",
                "import numpy as np\n",
                "from collections import defaultdict\n",
                "import gzip\n",
                "import torch\n",
                "import torch.nn as nn\n",
                "import os\n",
                "from sklearn.model_selection import train_test_split\n",
                "import requests\n",
                "from tqdm import tqdm\n",
                "from genetok.tokenizer import GeneticTokenizer\n",
                "import timeit\n",
                "import itertools\n",
                "from scipy import stats\n",
                "import pandas as pd"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "## Corpus Creation\n",
                "\n",
                "We will use a French Bible as the source, and an English Bible as the target. \n",
                "\n",
                "We will hold back 10% of the data (the same lines from both verse-aligned Bibles) for evaluation. "
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 31,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Training set size: 27949 pairs\n",
                        "Test set size: 3106 pairs\n"
                    ]
                }
            ],
            "source": [
                "def download_corpus(url, filename):\n",
                "    if not os.path.exists(filename):\n",
                "        print(f\"Downloading {filename}...\")\n",
                "        response = requests.get(url)\n",
                "        with open(filename, 'w', encoding='utf-8') as f:\n",
                "            f.write(response.text)\n",
                "    \n",
                "    with open(filename, 'r', encoding='utf-8') as f:\n",
                "        return f.read().split('\\n')\n",
                "\n",
                "# Download and preprocess the corpora\n",
                "french_url = \"https://github.com/BibleNLP/ebible/raw/main/corpus/fra-fraLSG.txt\"\n",
                "english_url = \"https://github.com/BibleNLP/ebible/raw/main/corpus/eng-eng-web.txt\"\n",
                "\n",
                "french_corpus = download_corpus(french_url, \"french_corpus.txt\")\n",
                "english_corpus = download_corpus(english_url, \"english_corpus.txt\")\n",
                "\n",
                "# Remove empty lines and ensure corpora are aligned\n",
                "french_corpus = [line for line in french_corpus if line.strip()]\n",
                "english_corpus = [line for line in english_corpus if line.strip()]\n",
                "min_length = min(len(french_corpus), len(english_corpus))\n",
                "french_corpus = french_corpus[:min_length]\n",
                "english_corpus = english_corpus[:min_length]\n",
                "\n",
                "# Create test and train sets\n",
                "train_french, test_french, train_english, test_english = train_test_split(\n",
                "    french_corpus, english_corpus, test_size=0.1, random_state=42\n",
                ")\n",
                "\n",
                "print(f\"Training set size: {len(train_french)} pairs\")\n",
                "print(f\"Test set size: {len(test_french)} pairs\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 32,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Processing corpus...\n"
                    ]
                },
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "Processing corpus: 100%|\u001b[32m██████████\u001b[0m| 27949/27949 [00:00<00:00, 2091315.56it/s]\n"
                    ]
                },
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Training tokenizer...\n"
                    ]
                },
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "Evolving Tokenizer: 100%|██████████| 364/364 [00:00<00:00, 605.86it/s]\n"
                    ]
                },
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Processing corpus...\n"
                    ]
                },
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "Processing corpus: 100%|\u001b[32m██████████\u001b[0m| 27949/27949 [00:00<00:00, 2662607.09it/s]\n"
                    ]
                },
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Training tokenizer...\n"
                    ]
                },
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "Evolving Tokenizer: 100%|██████████| 352/352 [00:00<00:00, 417.20it/s]"
                    ]
                },
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "French tokenization example:\n",
                        "[44, 2616, 2027, 1822, 794, 2778, 1926, 1930, 1677, 1714, 55, 317, 108, 409, 145, 229, 1627, 524, 533, 1813, 398, 1958, 444, 1733, 737, 535, 1736, 1166, 1631, 1965, 407, 122, 443, 2740, 1631, 1733, 737, 163, 325, 73, 1787, 1865, 928, 147, 480, 708, 214]\n",
                        "ou|ven|ez|-|vous |de c|e qui |s’e|st |pas|s|é |d|ès |les |te|mp|s a|nc|iens|; |ar |je |suis |Dieu|, et il |n’y |en a| |point d|’a|utr|e, |Je| |suis |Dieu|, et |nu|l |n’est |sembl|ab|le |à |moi|.\n",
                        "\n",
                        "English tokenization example:\n",
                        "[2190, 1561, 1631, 2343, 977, 2262, 1507, 1307, 1326, 1631, 1184, 296, 534, 1032, 1126, 1631, 1115, 1199, 1307, 464, 951, 1256, 1259, 11, 210, 1332, 2641, 22, 2761, 1586, 1397, 593, 2454, 921, 2659, 1256, 155, 120, 214]\n",
                        "I will |go| |before |you |and m|ake| the |rough| |place|s s|mo|oth|. I will| |bre|ak| the |do|or|s of |bro|n|z|e i|n p|ie|ces |and c|ut |ap|art |the |bar|s of |ir|on|.\n"
                    ]
                },
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "\n"
                    ]
                }
            ],
            "source": [
                "# Tokenization function\n",
                "\n",
                "def train_genetic_tokenizer(corpus, min_range=1, max_range=8, max_population=10, start_population=15, families=2, step_epochs=10):\n",
                "    print(\"Processing corpus...\")\n",
                "    data = []\n",
                "    combined_text = \"\"\n",
                "    for sentence in tqdm(corpus, desc=\"Processing corpus\", colour=\"green\"):\n",
                "        combined_text += sentence\n",
                "        if len(combined_text) > 10000:\n",
                "            data.append(combined_text)\n",
                "            combined_text = \"\"\n",
                "    \n",
                "    if combined_text:  # Add any remaining text\n",
                "        data.append(combined_text)\n",
                "\n",
                "    print(\"Training tokenizer...\")\n",
                "    _tokenizer = GeneticTokenizer(min_range=min_range, max_range=max_range, \n",
                "                                  max_population=max_population, start_population=start_population, \n",
                "                                  families=families, step_epochs=step_epochs)\n",
                "    _tokenizer.evolve(data)\n",
                "    \n",
                "    return _tokenizer\n",
                "\n",
                "# Train tokenizers for both French and English\n",
                "french_tokenizer = train_genetic_tokenizer(train_french)\n",
                "english_tokenizer = train_genetic_tokenizer(train_english)\n",
                "\n",
                "# Function to tokenize a sentence\n",
                "def tokenize_sentence(sentence: str, _tokenizer: GeneticTokenizer) -> list[int]:\n",
                "    return _tokenizer.tokenize(sentence)\n",
                "\n",
                "# Example usage\n",
                "print(\"French tokenization example:\")\n",
                "french_tokens = tokenize_sentence(train_french[0], french_tokenizer)\n",
                "print(french_tokens)\n",
                "print(french_tokenizer.detokenize(french_tokens))\n",
                "\n",
                "print(\"\\nEnglish tokenization example:\")\n",
                "english_tokens = tokenize_sentence(train_english[0], english_tokenizer)\n",
                "print(english_tokens)\n",
                "print(english_tokenizer.detokenize(english_tokens))\n",
                "\n",
                "# Optionally, save the tokenizers for future use\n",
                "# french_tokenizer.save(\"french_tokenizer\")\n",
                "# english_tokenizer.save(\"english_tokenizer\")\n"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "## Scoring metrics\n",
                "\n",
                "Next, we need a way to calculate the likelihood of a sequence's being found in the monolingual corpus and also the likelihood of finding a sequence-pair in the bilingual corpus.\n",
                "\n",
                "We will implement each of the following:\n",
                "\n",
                "1. N-gram Language Model:\n",
                "   - Train on your small corpus (e.g., 1000 sentences)\n",
                "   - Use n-grams (e.g., bigrams or trigrams) to calculate probability\n",
                "   - Apply smoothing techniques (e.g., Laplace or Good-Turing) to handle unseen n-grams\n",
                "   - Calculate perplexity using these probabilities\n",
                "\n",
                "2. Markov Chain Approach:\n",
                "   - Build a transition probability matrix from your corpus\n",
                "   - Calculate the likelihood of the sentence as the product of transition probabilities\n",
                "   - This is essentially a first-order n-gram model but can be more intuitive to implement\n",
                "\n",
                "3. Character-level Language Model:\n",
                "   - Train a simple character-level model (could be n-gram based or a small neural network)\n",
                "   - Generally requires less data than word-level models\n",
                "   - Calculate perplexity at the character level\n",
                "\n",
                "4. Simplified Neural Language Model:\n",
                "   - Train a small neural network (e.g., single layer LSTM) on your limited dataset\n",
                "   - Use word or subword embeddings to reduce vocabulary size\n",
                "   - Calculate perplexity using this model\n",
                "\n",
                "5. Compression-based Approach:\n",
                "   - Use a compression algorithm (e.g., gzip) on your corpus\n",
                "   - Compress the corpus with and without the target sentence\n",
                "   - Compare file sizes to estimate the information content of the sentence\n",
                "\n",
                "6. Average Word Probability:\n",
                "   - Calculate unigram probabilities from your corpus\n",
                "   - Take the geometric mean of word probabilities in the sentence\n",
                "   - This is a very simple approach but can be effective for small datasets"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 33,
            "metadata": {},
            "outputs": [],
            "source": [
                "class NGramLanguageModel:\n",
                "    \"\"\"\n",
                "    A class to represent an N-gram Language Model.\n",
                "    \n",
                "    This model uses n-grams (sequences of n words) to calculate the probability\n",
                "    of a given sequence of words. It is trained on a corpus of text and can be\n",
                "    used to estimate the likelihood of new sequences.\n",
                "    \n",
                "    Key features:\n",
                "    - Supports different n-gram sizes (e.g., bigrams, trigrams)\n",
                "    - Applies smoothing techniques to handle unseen n-grams\n",
                "    - Calculates perplexity as a measure of how well the model predicts a sample\n",
                "    \n",
                "    The model accepts a corpus and a tokenizer, then\n",
                "    builds probability distributions for token n-grams based on their frequency in\n",
                "    the corpus. These probabilities are used to score new sequences and\n",
                "    calculate their perplexity.\n",
                "    \"\"\"\n",
                "    def __init__(self, corpus: list[str], tokenizer: GeneticTokenizer, n: int = 2):\n",
                "        self.corpus = corpus\n",
                "        self.n = n\n",
                "        self.tokenizer = tokenizer\n",
                "        self.ngram_counts = defaultdict(int)\n",
                "        self.context_counts = defaultdict(int)\n",
                "        self.vocab = set()\n",
                "        self.train()\n",
                "\n",
                "    def train(self):\n",
                "        \"\"\"Train the model on the corpus.\"\"\"\n",
                "        for sentence in self.corpus:\n",
                "            tokens = self.tokenizer.tokenize(sentence)\n",
                "            self.vocab.update(tokens)\n",
                "            for i in range(len(tokens) - self.n + 1):\n",
                "                ngram = tuple(tokens[i:i+self.n])\n",
                "                self.ngram_counts[ngram] += 1\n",
                "                self.context_counts[ngram[:-1]] += 1\n",
                "\n",
                "    def get_probability(self, ngram: tuple) -> float:\n",
                "        \"\"\"Calculate the probability of an n-gram with Laplace smoothing.\"\"\"\n",
                "        context = ngram[:-1]\n",
                "        return (self.ngram_counts[ngram] + 1) / (self.context_counts[context] + len(self.vocab))\n",
                "\n",
                "    def score_sequence(self, sequence: str) -> float:\n",
                "        \"\"\"Calculate the log probability of a sequence.\"\"\"\n",
                "        tokens = self.tokenizer.tokenize(sequence)\n",
                "        log_probs = np.zeros(len(tokens) - self.n + 1)\n",
                "        for i in range(len(tokens) - self.n + 1):\n",
                "            ngram = tuple(tokens[i:i+self.n])\n",
                "            prob = self.get_probability(ngram)\n",
                "            log_probs[i] = np.log(max(prob, 1e-10))\n",
                "        return np.sum(log_probs)\n",
                "\n",
                "    def perplexity(self, sequence: str) -> float:\n",
                "        \"\"\"Calculate the perplexity of a sequence.\"\"\"\n",
                "        tokens = self.tokenizer.tokenize(sequence)\n",
                "        try:\n",
                "            return np.exp(-self.score_sequence(sequence) / (len(tokens) - self.n + 1))\n",
                "        except OverflowError:\n",
                "            # Handle extremely high perplexity\n",
                "            return float('inf')\n",
                "\n",
                "# start_time = timeit.default_timer()\n",
                "# french_ngram_model = NGramLanguageModel(train_french, french_tokenizer)\n",
                "# english_ngram_model = NGramLanguageModel(train_english, english_tokenizer)\n",
                "# end_time = timeit.default_timer()\n",
                "# print(f\"Time taken to train ngram models: {end_time - start_time:.2f} seconds\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 34,
            "metadata": {},
            "outputs": [],
            "source": [
                "class MarkovChainApproach:\n",
                "    \"\"\"\n",
                "    A class to represent a Markov Chain Language Model.\n",
                "    \n",
                "    This model uses a Markov Chain to calculate the probability of a given sequence\n",
                "    of words. It is trained on a corpus of text and can be used to estimate the\n",
                "    likelihood of new sequences.\n",
                "    \n",
                "    Key features:\n",
                "    - Builds a transition probability matrix from the corpus\n",
                "    - Calculates the likelihood of a sequence as the product of transition probabilities\n",
                "    - Can be implemented as a first-order n-gram model\n",
                "    - Uses Laplace smoothing to handle unseen transitions\n",
                "    \n",
                "    The model accepts a corpus and a tokenizer, then\n",
                "    builds a transition probability matrix from the corpus.\n",
                "    \"\"\"\n",
                "    def __init__(self, corpus: list[str], tokenizer: GeneticTokenizer):\n",
                "        self.corpus = corpus\n",
                "        self.tokenizer = tokenizer\n",
                "        self.transition_probs = defaultdict(lambda: defaultdict(float))\n",
                "        self.vocab = set()\n",
                "        self.train()\n",
                "\n",
                "    def train(self):\n",
                "        for sentence in self.corpus:\n",
                "            tokens = self.tokenizer.tokenize(sentence)\n",
                "            self.vocab.update(tokens)\n",
                "            for current, next in zip(tokens, tokens[1:]):\n",
                "                self.transition_probs[current][next] += 1\n",
                "        \n",
                "        # Normalize probabilities with Laplace smoothing\n",
                "        vocab_size = len(self.vocab)\n",
                "        for current, transitions in self.transition_probs.items():\n",
                "            total = sum(transitions.values()) + vocab_size\n",
                "            for next in self.vocab:\n",
                "                transitions[next] = (transitions[next] + 1) / total\n",
                "\n",
                "    def score_sequence(self, sequence: str) -> float:\n",
                "        tokens = self.tokenizer.tokenize(sequence)\n",
                "        log_probs = np.zeros(len(tokens) - 1)\n",
                "        for i, (current, next) in enumerate(zip(tokens, tokens[1:])):\n",
                "            prob = self.transition_probs[current][next]\n",
                "            log_probs[i] = np.log(max(prob, 1e-10))\n",
                "        return np.sum(log_probs)\n",
                "\n",
                "    def perplexity(self, sequence: str) -> float:\n",
                "        tokens = self.tokenizer.tokenize(sequence)\n",
                "        score = self.score_sequence(sequence)\n",
                "        return np.exp(-score / (len(tokens) - 1))\n",
                "\n",
                "# start_time = timeit.default_timer()\n",
                "# french_markov_model = MarkovChainApproach(french_corpus, french_tokenizer)\n",
                "# english_markov_model = MarkovChainApproach(english_corpus, english_tokenizer)\n",
                "# end_time = timeit.default_timer()\n",
                "# print(f\"Time taken to train markov model: {end_time - start_time:.2f} seconds\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 35,
            "metadata": {},
            "outputs": [],
            "source": [
                "class CharacterLevelLanguageModel:\n",
                "    \"\"\"\n",
                "    A class to represent a Character Level Language Model.\n",
                "    \n",
                "    This model uses a Markov Chain to calculate the probability of a given sequence\n",
                "    of characters. It is trained on a corpus of text and can be used to estimate the\n",
                "    likelihood of new sequences.\n",
                "    \"\"\"\n",
                "    def __init__(self, corpus: list[str], tokenizer: GeneticTokenizer, n: int = 3):\n",
                "        self.corpus = corpus\n",
                "        self.tokenizer = tokenizer\n",
                "        self.n = n\n",
                "        self.char_counts = {}\n",
                "        self.context_totals = {}\n",
                "        self.vocab_size = 0\n",
                "        self.smoothing_factor = 0.1\n",
                "        self.train()\n",
                "\n",
                "    def train(self):\n",
                "        all_chars = set()\n",
                "        for sentence in self.corpus:\n",
                "            chars = list(sentence)\n",
                "            all_chars.update(chars)\n",
                "            for i in range(len(chars) - self.n + 1):\n",
                "                context = tuple(chars[i:i+self.n-1])\n",
                "                next_char = chars[i+self.n-1]\n",
                "                if context not in self.char_counts:\n",
                "                    self.char_counts[context] = {}\n",
                "                self.char_counts[context][next_char] = self.char_counts[context].get(next_char, 0) + 1\n",
                "\n",
                "        self.vocab_size = len(all_chars)\n",
                "        for context, counts in self.char_counts.items():\n",
                "            self.context_totals[context] = sum(counts.values())\n",
                "\n",
                "    def score_sequence(self, sequence: str) -> float:\n",
                "        chars = np.array(list(sequence))\n",
                "        log_probs = np.zeros(len(chars) - self.n + 1)\n",
                "        \n",
                "        for i in range(len(chars) - self.n + 1):\n",
                "            context = tuple(chars[i:i+self.n-1])\n",
                "            next_char = chars[i+self.n-1]\n",
                "            \n",
                "            if context in self.char_counts:\n",
                "                count = self.char_counts[context].get(next_char, 0)\n",
                "                total = self.context_totals[context]\n",
                "            else:\n",
                "                count = 0\n",
                "                total = 0\n",
                "            \n",
                "            prob = (count + self.smoothing_factor) / (total + self.smoothing_factor * self.vocab_size)\n",
                "            log_probs[i] = np.log(max(prob, 1e-10))\n",
                "        \n",
                "        return np.sum(log_probs)\n",
                "\n",
                "    def perplexity(self, sequence: str) -> float:\n",
                "        try:\n",
                "            score = self.score_sequence(sequence)\n",
                "            return np.exp(-score / len(sequence))\n",
                "        except ValueError:\n",
                "            return float('inf')\n",
                "\n",
                "# start_time = timeit.default_timer()\n",
                "# french_character_model = CharacterLevelLanguageModel(french_corpus, french_tokenizer)\n",
                "# english_character_model = CharacterLevelLanguageModel(english_corpus, english_tokenizer)\n",
                "# end_time = timeit.default_timer()\n",
                "# print(f\"Time taken to train character models: {end_time - start_time:.2f} seconds\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 36,
            "metadata": {},
            "outputs": [],
            "source": [
                "class SimplifiedNeuralLanguageModel:\n",
                "    \"\"\"\n",
                "    A class to represent a Simplified Neural Language Model.\n",
                "    \n",
                "    This model uses a small neural network (in this case, a single layer LSTM) to calculate the probability of a given sequence\n",
                "    of words. It is trained on a corpus of text and can be used to estimate the\n",
                "    likelihood of new sequences.\n",
                "    \"\"\"\n",
                "    def __init__(self, corpus: list[str], tokenizer: GeneticTokenizer, embedding_dim: int = 50, hidden_dim: int = 100):\n",
                "        self.corpus = corpus\n",
                "        self.tokenizer = tokenizer\n",
                "        self.vocab = set(token for sentence in corpus for token in tokenizer.tokenize(sentence))\n",
                "        self.token_to_index = {token: i for i, token in enumerate(self.vocab)}\n",
                "        self.index_to_token = {i: token for token, i in self.token_to_index.items()}\n",
                "        \n",
                "        self.embedding_dim = embedding_dim\n",
                "        self.hidden_dim = hidden_dim\n",
                "        self.model = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)\n",
                "        self.embedding = nn.Embedding(len(self.vocab), embedding_dim)\n",
                "        self.fc = nn.Linear(hidden_dim, len(self.vocab))\n",
                "        self.criterion = nn.CrossEntropyLoss()\n",
                "        self.optimizer = torch.optim.Adam(self.model.parameters())\n",
                "\n",
                "    def train(self, epochs: int = 10):\n",
                "        for epoch in range(epochs):\n",
                "            total_loss = 0\n",
                "            for sentence in self.corpus:\n",
                "                tokens = self.tokenizer.tokenize(sentence)\n",
                "                indices = [self.token_to_index[token] for token in tokens]\n",
                "                input_tensor = torch.LongTensor(indices[:-1]).unsqueeze(0)\n",
                "                target_tensor = torch.LongTensor(indices[1:])\n",
                "\n",
                "                self.optimizer.zero_grad()\n",
                "                embedded = self.embedding(input_tensor)\n",
                "                output, _ = self.model(embedded)\n",
                "                output = self.fc(output.squeeze(0))\n",
                "                loss = self.criterion(output, target_tensor)\n",
                "                loss.backward()\n",
                "                self.optimizer.step()\n",
                "                total_loss += loss.item()\n",
                "            print(f\"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}\")\n",
                "\n",
                "    def score_sequence(self, sequence: str) -> float:\n",
                "        tokens = self.tokenizer.tokenize(sequence)\n",
                "        if '<UNK>' not in self.token_to_index:\n",
                "            self.token_to_index['<UNK>'] = len(self.token_to_index)\n",
                "            self.index_to_token[self.token_to_index['<UNK>']] = '<UNK>'\n",
                "            self.embedding = nn.Embedding(len(self.token_to_index), self.embedding_dim)\n",
                "            self.fc = nn.Linear(self.hidden_dim, len(self.token_to_index))\n",
                "\n",
                "        indices = np.array([self.token_to_index.get(token, self.token_to_index['<UNK>']) for token in tokens])\n",
                "        \n",
                "        if len(indices) < 2:\n",
                "            return np.float32(-np.inf)\n",
                "        \n",
                "        input_tensor = torch.LongTensor(indices[:-1]).unsqueeze(0)\n",
                "        target_tensor = torch.LongTensor(indices[1:])\n",
                "\n",
                "        with torch.no_grad():\n",
                "            embedded = self.embedding(input_tensor)\n",
                "            output, _ = self.model(embedded)\n",
                "            output = self.fc(output.squeeze(0))\n",
                "            loss = self.criterion(output, target_tensor)\n",
                "        \n",
                "        return -loss.item()\n",
                "\n",
                "    def perplexity(self, sequence: str) -> float:\n",
                "        score = self.score_sequence(sequence)\n",
                "        if np.isinf(score):\n",
                "            return np.float32(np.inf)\n",
                "        return np.exp(-score / max(1, len(self.tokenizer.tokenize(sequence))))\n",
                "\n",
                "# start_time = timeit.default_timer()\n",
                "# french_neural_model = SimplifiedNeuralLanguageModel(french_corpus, french_tokenizer)\n",
                "# english_neural_model = SimplifiedNeuralLanguageModel(english_corpus, english_tokenizer)\n",
                "# end_time = timeit.default_timer()\n",
                "# print(f\"Time taken to train neural models: {end_time - start_time:.2f} seconds\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 37,
            "metadata": {},
            "outputs": [],
            "source": [
                "class CompressionBasedApproach:\n",
                "    def __init__(self, corpus: list[str], tokenizer: GeneticTokenizer):\n",
                "        self.corpus = corpus\n",
                "        self.tokenizer = tokenizer\n",
                "        self.compressed_corpus = self.compress_corpus()\n",
                "        self.max_score = 1e10\n",
                "\n",
                "    def compress_corpus(self) -> bytes:\n",
                "        corpus_text = ' '.join(self.corpus)\n",
                "        return gzip.compress(corpus_text.encode('utf-8'))\n",
                "\n",
                "    def score_sequence(self, sequence: str) -> float:\n",
                "        corpus_with_sequence = ' '.join(self.corpus + [sequence])\n",
                "        compressed_with_sequence = gzip.compress(corpus_with_sequence.encode('utf-8'))\n",
                "        score = len(compressed_with_sequence) - len(self.compressed_corpus)\n",
                "        \n",
                "        # Use numpy's clip function for efficiency\n",
                "        return np.clip(score, -self.max_score, self.max_score)\n",
                "\n",
                "    def perplexity(self, sequence: str) -> float:\n",
                "        # Note: This is not a true perplexity, but a relative measure\n",
                "        score = self.score_sequence(sequence)\n",
                "        sequence_length = max(1, len(sequence))\n",
                "        \n",
                "        # Use numpy's exp function and handle potential overflow\n",
                "        try:\n",
                "            return np.exp(score / sequence_length)\n",
                "        except OverflowError:\n",
                "            return np.float32(np.inf)  # Return infinity for extremely unlikely sequences\n",
                "\n",
                "# Uncomment these lines when ready to use\n",
                "# start_time = timeit.default_timer()\n",
                "# french_compression_model = CompressionBasedApproach(french_corpus, french_tokenizer)\n",
                "# english_compression_model = CompressionBasedApproach(english_corpus, english_tokenizer)\n",
                "# end_time = timeit.default_timer()\n",
                "# print(f\"Time taken to train compression models: {end_time - start_time:.2f} seconds\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 38,
            "metadata": {},
            "outputs": [],
            "source": [
                "class AverageWordProbability:\n",
                "    def __init__(self, corpus: list[str], tokenizer: GeneticTokenizer):\n",
                "        self.corpus = corpus\n",
                "        self.tokenizer = tokenizer\n",
                "        self.word_probs = defaultdict(float)\n",
                "        self.total_words = 0\n",
                "        self.train()\n",
                "\n",
                "    def train(self):\n",
                "        for sentence in self.corpus:\n",
                "            tokens = self.tokenizer.tokenize(sentence)\n",
                "            for token in tokens:\n",
                "                self.word_probs[token] += 1\n",
                "            self.total_words += len(tokens)\n",
                "        \n",
                "        total_words_inv = 1 / self.total_words\n",
                "        for word in self.word_probs:\n",
                "            self.word_probs[word] *= total_words_inv\n",
                "\n",
                "    def score_sequence(self, sequence: str) -> float:\n",
                "        tokens = self.tokenizer.tokenize(sequence)\n",
                "        epsilon = 1e-10  # Smoothing factor\n",
                "        log_prob = np.sum([np.log(self.word_probs.get(token, epsilon) + epsilon) for token in tokens])\n",
                "        return log_prob / max(1, len(tokens))  # Avoid division by zero\n",
                "\n",
                "    def perplexity(self, sequence: str) -> float:\n",
                "        score = self.score_sequence(sequence)\n",
                "        return np.exp(-score) if np.isfinite(score) else np.inf\n",
                "\n",
                "# start_time = timeit.default_timer()\n",
                "# french_average_word_probability_model = AverageWordProbability(french_corpus, french_tokenizer)\n",
                "# english_average_word_probability_model = AverageWordProbability(english_corpus, english_tokenizer)\n",
                "# end_time = timeit.default_timer()\n",
                "# print(f\"Time taken to train average word probability models: {end_time - start_time:.2f} seconds\")\n"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 39,
            "metadata": {},
            "outputs": [],
            "source": [
                "hypothesis_sentence_pair_correct = (\"Je suis un homme.\", \"I am a man.\")\n",
                "hypothesis_sentence_pair_incorrect = (\"Je suis un homme.\", \"Car fruit challenge fleet sky walks update.\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 40,
            "metadata": {},
            "outputs": [],
            "source": [
                "class UnifiedLanguageScorer:\n",
                "    def __init__(self, corpus: list[str], tokenizer: GeneticTokenizer, debug_mode: bool = False):\n",
                "        print(\"Initializing UnifiedLanguageScorer\")\n",
                "        self.corpus = corpus\n",
                "        self.tokenizer = tokenizer\n",
                "        self.debug_mode = debug_mode\n",
                "        \n",
                "        # Initialize all models\n",
                "        self.models = {\n",
                "            'ngram': NGramLanguageModel(corpus, tokenizer),\n",
                "            'markov': MarkovChainApproach(corpus, tokenizer),\n",
                "            'character': CharacterLevelLanguageModel(corpus, tokenizer),\n",
                "            'neural': SimplifiedNeuralLanguageModel(corpus, tokenizer),\n",
                "            'compression': CompressionBasedApproach(corpus, tokenizer),\n",
                "            'avg_word': AverageWordProbability(corpus, tokenizer)\n",
                "        }\n",
                "\n",
                "    def score_sentence(self, sentence: str, models: list[str] = None) -> dict:\n",
                "        models = models or list(self.models.keys())\n",
                "        \n",
                "        scores = {}\n",
                "        for model_name in models:\n",
                "            if model_name not in self.models:\n",
                "                raise ValueError(f\"Unknown model: {model_name}\")\n",
                "            model = self.models[model_name]\n",
                "            start_time = timeit.default_timer()\n",
                "            try:\n",
                "                score = model.score_sequence(sentence)\n",
                "                perplexity = model.perplexity(sentence)\n",
                "            except (ValueError, OverflowError, ZeroDivisionError):\n",
                "                score, perplexity = float('-inf'), float('inf')\n",
                "            end_time = timeit.default_timer()\n",
                "            \n",
                "            scores[model_name] = {\n",
                "                'score': score,\n",
                "                'perplexity': perplexity,\n",
                "                'time': end_time - start_time\n",
                "            }\n",
                "            if self.debug_mode:\n",
                "                print(f\"Scored with {model_name} model in {end_time - start_time:.4f} seconds\")\n",
                "        \n",
                "        return scores\n",
                "\n",
                "    def train_models(self):\n",
                "        for model in self.models.values():\n",
                "            if hasattr(model, 'train'):\n",
                "                model.train()\n",
                "                \n",
                "    def factorial_analysis(self, test_sentences: list[str], true_labels: list[int], \n",
                "                           metric: str = 'perplexity') -> pd.DataFrame:\n",
                "        \"\"\"\n",
                "        Perform factorial analysis on the scoring methods.\n",
                "        \n",
                "        :param test_sentences: List of sentences to evaluate\n",
                "        :param true_labels: List of true labels (0 for negative, 1 for positive)\n",
                "        :param metric: 'score' or 'perplexity'\n",
                "        :return: DataFrame with analysis results\n",
                "        \"\"\"\n",
                "        model_names = list(self.models.keys())\n",
                "        results = []\n",
                "\n",
                "        total_combinations = sum(1 for r in range(1, len(model_names) + 1) \n",
                "                                 for _ in itertools.combinations(model_names, r))\n",
                "\n",
                "        with tqdm(total=total_combinations, desc=\"Factorial Analysis\") as pbar:\n",
                "            for r in range(1, len(model_names) + 1):\n",
                "                for combination in itertools.combinations(model_names, r):\n",
                "                    scores = [\n",
                "                        np.mean([self.score_sentence(sentence, list(combination))[model][metric] \n",
                "                                 for model in combination])\n",
                "                        for sentence in test_sentences\n",
                "                    ]\n",
                "                    \n",
                "                    correlation, _ = stats.pointbiserialr(true_labels, scores)\n",
                "                    \n",
                "                    results.append({\n",
                "                        'models': '+'.join(combination),\n",
                "                        'correlation': correlation,\n",
                "                        'num_models': len(combination)\n",
                "                    })\n",
                "                    \n",
                "                    pbar.update(1)\n",
                "\n",
                "        return pd.DataFrame(results).sort_values('correlation', ascending=False)\n",
                "\n",
                "    def stepwise_model_selection(self, test_sentences: list[str], true_labels: list[int], \n",
                "                                 metric: str = 'perplexity') -> list[str]:\n",
                "        \"\"\"\n",
                "        Perform stepwise model selection to find the best combination of scoring methods.\n",
                "        \n",
                "        :param test_sentences: List of sentences to evaluate\n",
                "        :param true_labels: List of true labels (0 for negative, 1 for positive)\n",
                "        :param metric: 'score' or 'perplexity'\n",
                "        :return: List of selected models in order of selection\n",
                "        \"\"\"\n",
                "        available_models = set(self.models.keys())\n",
                "        selected_models = []\n",
                "        best_correlation = float('-inf')\n",
                "\n",
                "        while available_models:\n",
                "            step_results = []\n",
                "            for model in available_models:\n",
                "                current_models = selected_models + [model]\n",
                "                scores = [\n",
                "                    np.mean([self.score_sentence(sentence, current_models)[m][metric] \n",
                "                             for m in current_models])\n",
                "                    for sentence in test_sentences\n",
                "                ]\n",
                "                \n",
                "                correlation, _ = stats.pointbiserialr(true_labels, scores)\n",
                "                step_results.append((model, correlation))\n",
                "            \n",
                "            best_model, new_correlation = max(step_results, key=lambda x: x[1])\n",
                "            if new_correlation > best_correlation:\n",
                "                selected_models.append(best_model)\n",
                "                available_models.remove(best_model)\n",
                "                best_correlation = new_correlation\n",
                "            else:\n",
                "                break\n",
                "\n",
                "        return selected_models"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "Now let's compare all the various language models as metrics for the likelihood of seeing a sentence in the corpus."
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 41,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "Initializing UnifiedLanguageScorer\n",
                        "Initializing UnifiedLanguageScorer\n",
                        "Time taken to train language models: 12.41 seconds\n"
                    ]
                }
            ],
            "source": [
                "start_time = timeit.default_timer()\n",
                "french_language_scorer = UnifiedLanguageScorer(french_corpus, french_tokenizer)\n",
                "english_language_scorer = UnifiedLanguageScorer(english_corpus, english_tokenizer)\n",
                "end_time = timeit.default_timer()\n",
                "print(f\"Time taken to train language models: {end_time - start_time:.2f} seconds\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 42,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "French samples:\n",
                        "- Balaam répondit à l’ânesse: C’est parce que tu t’es moquée de moi; si j’avais une épée dans la main, je te tuerais à l’instant.; corrupted: avec ns le nish roue bee de la porte am, the plu lon d his e su nu de usement oie te  he sons rnelle ’ai  nt t fe in t g,  swe ait e the p nt et said mai in bloo m, an i,  ainst  rs d  Ya the sec s d que ow on  s e s as n n\n",
                        "- Elle vit de la cavalerie, des cavaliers deux à deux, Des cavaliers sur des ânes, des cavaliers sur des chameaux; Et elle était attentive, très attentive.; corrupted: ls de Jo el des a pe  tho vie met ermis, m s de  ux yeux  fering  shall  ui qu’o ildren  qua des s he wh ons da h our atr eux qui for  he ma eig e vous  montag Ju d .T ’exigean s to e que voi ruine a nu e v port nes d ls d s de la  will b orsqu’il e e said le f str nation t the ore y toe a ehol crific hat he eli\n",
                        "- Jésus, ému de compassion, étendit la main, le toucha, et dit: Je le veux, sois pur.; corrupted: l m sep thes , t maison e et l disciple e pa crifi eur  srael,  pou ig ces ies  int saient t qu es d nd,  ien t fes c l land parè on p e ha\n",
                        "\n",
                        "English samples:\n",
                        "- but Paul chose Silas and went out, being commended by the brothers to the grace of God.; corrupted: par sa  eration d, and th und ut t ildre e p ah mong t u l appa  ll  , fils d their f for th , “Wha therefor s dan ’Éter ape. He  he hi serva , et moi  ame  nt:  animal lide\n",
                        "- Praise Yah! Praise Yahweh, my soul.; corrupted: or t aa c’est  dans la ith  ll happ gh in, se na Pierre n peuple\n",
                        "- He said to me, “Write, ‘Blessed are those who are invited to the wedding supper of the Lamb.’” He said to me, “These are true words of God.”; corrupted: ds. We we gains the sur la  ern ll the m him  ble u’il  him t n fr ille other  eux. prepared n’est  lie f the ’s wife u no tes d e vous l ux qu au  me de Yah t, e qu’il mag sword  fils d r i ite rease  ce,  ieu r, ale don’t  use du s is th that\n",
                        "\n",
                        "Scoring French samples:\n",
                        "\n",
                        "Sample: Balaam répondit à l’ânesse: C’est parce que tu t’es moquée de moi; si j’avais une épée dans la main, je te tuerais à l’instant.\n",
                        "  ngram: score = -202.5449, perplexity = 111.0906\n",
                        "  markov: score = -202.5449, perplexity = 111.0906\n",
                        "  character: score = -236.2833, perplexity = 6.4269\n",
                        "  neural: score = -7.7026, perplexity = 1.1913\n",
                        "  compression: score = 66.0000, perplexity = 1.6815\n",
                        "  avg_word: score = -6.6264, perplexity = 754.7722\n",
                        "Corrupted sample: avec ns le nish roue bee de la porte am, the plu lon d his e su nu de usement oie te  he sons rnelle ’ai  nt t fe in t g,  swe ait e the p nt et said mai in bloo m, an i,  ainst  rs d  Ya the sec s d que ow on  s e s as n n\n",
                        "  ngram: score = -676.9172, perplexity = 3847.1477\n",
                        "  markov: score = -1000.7809, perplexity = 199714.6060\n",
                        "  character: score = -919.2759, perplexity = 61.7018\n",
                        "  neural: score = -7.6613, perplexity = 1.0967\n",
                        "  compression: score = 136.0000, perplexity = 1.8402\n",
                        "  avg_word: score = -8.3035, perplexity = 4038.0740\n",
                        "\n",
                        "Sample: Elle vit de la cavalerie, des cavaliers deux à deux, Des cavaliers sur des ânes, des cavaliers sur des chameaux; Et elle était attentive, très attentive.\n",
                        "  ngram: score = -260.2677, perplexity = 123.9369\n",
                        "  markov: score = -260.2677, perplexity = 123.9369\n",
                        "  character: score = -296.9385, perplexity = 6.9641\n",
                        "  neural: score = -7.6886, perplexity = 1.1500\n",
                        "  compression: score = 61.0000, perplexity = 1.4899\n",
                        "  avg_word: score = -6.7366, perplexity = 842.6634\n",
                        "Corrupted sample: ls de Jo el des a pe  tho vie met ermis, m s de  ux yeux  fering  shall  ui qu’o ildren  qua des s he wh ons da h our atr eux qui for  he ma eig e vous  montag Ju d .T ’exigean s to e que voi ruine a nu e v port nes d ls d s de la  will b orsqu’il e e said le f str nation t the ore y toe a ehol crific hat he eli\n",
                        "  ngram: score = -875.7475, perplexity = 4189.9861\n",
                        "  markov: score = -1218.1633, perplexity = 109267.6556\n",
                        "  character: score = -1239.7679, perplexity = 52.5056\n",
                        "  neural: score = -7.6551, perplexity = 1.0749\n",
                        "  compression: score = 192.0000, perplexity = 1.8467\n",
                        "  avg_word: score = -7.9173, perplexity = 2744.2302\n",
                        "\n",
                        "Sample: Jésus, ému de compassion, étendit la main, le toucha, et dit: Je le veux, sois pur.\n",
                        "  ngram: score = -133.6636, perplexity = 141.2459\n",
                        "  markov: score = -133.6636, perplexity = 141.2459\n",
                        "  character: score = -145.6896, perplexity = 5.7852\n",
                        "  neural: score = -7.6916, perplexity = 1.3161\n",
                        "  compression: score = 41.0000, perplexity = 1.6388\n",
                        "  avg_word: score = -6.8036, perplexity = 901.0511\n",
                        "Corrupted sample: l m sep thes , t maison e et l disciple e pa crifi eur  srael,  pou ig ces ies  int saient t qu es d nd,  ien t fes c l land parè on p e ha\n",
                        "  ngram: score = -412.3404, perplexity = 3815.4294\n",
                        "  markov: score = -439.2923, perplexity = 6541.0100\n",
                        "  character: score = -588.3457, perplexity = 68.9032\n",
                        "  neural: score = -7.6773, perplexity = 1.1625\n",
                        "  compression: score = 87.0000, perplexity = 1.8699\n",
                        "  avg_word: score = -6.7349, perplexity = 841.2864\n",
                        "\n",
                        "Scoring English samples:\n",
                        "\n",
                        "Sample: but Paul chose Silas and went out, being commended by the brothers to the grace of God.\n",
                        "  ngram: score = -135.3887, perplexity = 182.5923\n",
                        "  markov: score = -135.3887, perplexity = 182.5923\n",
                        "  character: score = -154.1264, perplexity = 5.8801\n",
                        "  neural: score = -7.7778, perplexity = 1.3338\n",
                        "  compression: score = 44.0000, perplexity = 1.6582\n",
                        "  avg_word: score = -6.9217, perplexity = 1014.0714\n",
                        "Corrupted sample: par sa  eration d, and th und ut t ildre e p ah mong t u l appa  ll  , fils d their f for th , “Wha therefor s dan ’Éter ape. He  he hi serva , et moi  ame  nt:  animal lide\n",
                        "  ngram: score = -448.3469, perplexity = 5552.7896\n",
                        "  markov: score = -651.6015, perplexity = 276729.9239\n",
                        "  character: score = -648.3020, perplexity = 42.4111\n",
                        "  neural: score = -7.7628, perplexity = 1.1577\n",
                        "  compression: score = 107.0000, perplexity = 1.8561\n",
                        "  avg_word: score = -8.1775, perplexity = 3559.9525\n",
                        "\n",
                        "Sample: Praise Yah! Praise Yahweh, my soul.\n",
                        "  ngram: score = -43.9530, perplexity = 81.0692\n",
                        "  markov: score = -43.9530, perplexity = 81.0692\n",
                        "  character: score = -65.0881, perplexity = 6.4215\n",
                        "  neural: score = -7.7461, perplexity = 2.0222\n",
                        "  compression: score = 20.0000, perplexity = 1.7708\n",
                        "  avg_word: score = -6.6714, perplexity = 789.5009\n",
                        "Corrupted sample: or t aa c’est  dans la ith  ll happ gh in, se na Pierre n peuple\n",
                        "  ngram: score = -170.3699, perplexity = 3337.0899\n",
                        "  markov: score = -265.4584, perplexity = 308931.4942\n",
                        "  character: score = -274.4729, perplexity = 72.8672\n",
                        "  neural: score = -7.7664, perplexity = 1.4234\n",
                        "  compression: score = 49.0000, perplexity = 2.1503\n",
                        "  avg_word: score = -9.0805, perplexity = 8782.7192\n",
                        "\n",
                        "Sample: He said to me, “Write, ‘Blessed are those who are invited to the wedding supper of the Lamb.’” He said to me, “These are true words of God.”\n",
                        "  ngram: score = -185.0043, perplexity = 91.1311\n",
                        "  markov: score = -185.0043, perplexity = 91.1311\n",
                        "  character: score = -242.3570, perplexity = 5.6470\n",
                        "  neural: score = -7.7950, perplexity = 1.2039\n",
                        "  compression: score = 4.0000, perplexity = 1.0290\n",
                        "  avg_word: score = -6.3214, perplexity = 556.3413\n",
                        "Corrupted sample: ds. We we gains the sur la  ern ll the m him  ble u’il  him t n fr ille other  eux. prepared n’est  lie f the ’s wife u no tes d e vous l ux qu au  me de Yah t, e qu’il mag sword  fils d r i ite rease  ce,  ieu r, ale don’t  use du s is th that\n",
                        "  ngram: score = -618.5657, perplexity = 5384.0160\n",
                        "  markov: score = -916.3751, perplexity = 336863.1009\n",
                        "  character: score = -955.8304, perplexity = 50.2665\n",
                        "  neural: score = -7.7737, perplexity = 1.1124\n",
                        "  compression: score = 144.0000, perplexity = 1.8043\n",
                        "  avg_word: score = -8.8439, perplexity = 6931.6510\n"
                    ]
                }
            ],
            "source": [
                "# Now let's evaluate the likelihood of seeing some of our withheld test sentences in our corpus\n",
                "# We'll use the test sets we created earlier: test_french and test_english\n",
                "\n",
                "def generate_random_noise_sample(tokenizer: GeneticTokenizer, length: int = 10) -> str:\n",
                "    # Use the tokenizer's tokens instead of vocab\n",
                "    return ' '.join(random.choice(tokenizer.tokens) for _ in range(length))\n",
                "\n",
                "# Sample a few sentences from each test set\n",
                "num_samples = 3\n",
                "french_samples = random.sample(test_french, num_samples)\n",
                "english_samples = random.sample(test_english, num_samples)\n",
                "corrupted_french_samples = [generate_random_noise_sample(french_tokenizer, len(french_tokenizer.tokenize(sample))) for sample in french_samples]\n",
                "corrupted_english_samples = [generate_random_noise_sample(english_tokenizer, len(english_tokenizer.tokenize(sample))) for sample in english_samples]\n",
                "\n",
                "print(\"French samples:\")\n",
                "for sample in french_samples:\n",
                "    print(f\"- {sample}; corrupted: {corrupted_french_samples[french_samples.index(sample)]}\")\n",
                "\n",
                "print(\"\\nEnglish samples:\")\n",
                "for sample in english_samples:\n",
                "    print(f\"- {sample}; corrupted: {corrupted_english_samples[english_samples.index(sample)]}\")\n",
                "\n",
                "\n",
                "# Now let's score these samples using our language models\n",
                "print(\"\\nScoring French samples:\")\n",
                "for sample in french_samples:\n",
                "    scores = french_language_scorer.score_sentence(sample)\n",
                "    print(f\"\\nSample: {sample}\")\n",
                "    for model, score in scores.items():\n",
                "        print(f\"  {model}: score = {score['score']:.4f}, perplexity = {score['perplexity']:.4f}\")\n",
                "    \n",
                "    try:\n",
                "        corrupted_sample = corrupted_french_samples[french_samples.index(sample)]\n",
                "        corrupted_scores = french_language_scorer.score_sentence(corrupted_sample)\n",
                "        print(f\"Corrupted sample: {corrupted_sample}\")\n",
                "        for model, score in corrupted_scores.items():\n",
                "            print(f\"  {model}: score = {score['score']:.4f}, perplexity = {score['perplexity']:.4f}\")\n",
                "    except ValueError as e:\n",
                "        print(f\"Error scoring corrupted sample: {str(e)}\")\n",
                "\n",
                "print(\"\\nScoring English samples:\")\n",
                "for sample in english_samples:\n",
                "    scores = english_language_scorer.score_sentence(sample)\n",
                "    print(f\"\\nSample: {sample}\")\n",
                "    for model, score in scores.items():\n",
                "        print(f\"  {model}: score = {score['score']:.4f}, perplexity = {score['perplexity']:.4f}\")\n",
                "    \n",
                "    try:\n",
                "        corrupted_sample = corrupted_english_samples[english_samples.index(sample)]\n",
                "        corrupted_scores = english_language_scorer.score_sentence(corrupted_sample)\n",
                "        print(f\"Corrupted sample: {corrupted_sample}\")\n",
                "        for model, score in corrupted_scores.items():\n",
                "            print(f\"  {model}: score = {score['score']:.4f}, perplexity = {score['perplexity']:.4f}\")\n",
                "    except ValueError as e:\n",
                "        print(f\"Error scoring corrupted sample: {str(e)}\")\n"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "## Factorial Analysis\n",
                "\n",
                "We want to determine the most effective combination of the implemented language models to use for scoring the naturalness of a sentence against a corpus.\n",
                "\n",
                "The `factorial_analysis` method will give you a comprehensive view of how different combinations of scoring methods perform. It calculates the correlation between the combined scores and the true labels for each possible combination of scoring methods.\n",
                "The `stepwise_model_selection` method uses a forward selection approach to iteratively build the best combination of scoring methods. It starts with no methods and adds the best performing method at each step until adding more methods no longer improves the correlation with true labels.\n",
                "These approaches will help you identify:\n",
                "\n",
                "- Which individual scoring methods are most effective.\n",
                "- Which combinations of scoring methods work best together.\n",
                "- Whether using more scoring methods always leads to better results, or if there's a point of diminishing returns."
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 43,
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "French Factorial Analysis:\n"
                    ]
                },
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "Factorial Analysis: 100%|██████████| 63/63 [08:38<00:00,  8.23s/it]\n"
                    ]
                },
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "                          models  correlation  num_models\n",
                        "3                         neural    -0.438102           1\n",
                        "1                         markov    -0.530718           1\n",
                        "12                 markov+neural    -0.530718           2\n",
                        "13            markov+compression    -0.530718           2\n",
                        "34     markov+neural+compression    -0.530718           3\n",
                        "..                           ...          ...         ...\n",
                        "15              character+neural    -0.674100           2\n",
                        "16         character+compression    -0.674277           2\n",
                        "37  character+neural+compression    -0.677616           3\n",
                        "18            neural+compression    -0.774828           2\n",
                        "4                    compression    -0.963998           1\n",
                        "\n",
                        "[63 rows x 3 columns]\n",
                        "\n",
                        "English Factorial Analysis:\n"
                    ]
                },
                {
                    "name": "stderr",
                    "output_type": "stream",
                    "text": [
                        "Factorial Analysis: 100%|██████████| 63/63 [09:10<00:00,  8.74s/it]\n"
                    ]
                },
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "                                         models  correlation  num_models\n",
                        "3                                        neural     0.458478           1\n",
                        "18                           neural+compression    -0.318046           2\n",
                        "8                                  ngram+neural    -0.404863           2\n",
                        "0                                         ngram    -0.404897           1\n",
                        "28                     ngram+neural+compression    -0.404968           3\n",
                        "..                                          ...          ...         ...\n",
                        "30                   ngram+compression+avg_word    -0.782290           3\n",
                        "48              ngram+character+neural+avg_word    -0.782419           4\n",
                        "27                     ngram+character+avg_word    -0.782420           3\n",
                        "60  ngram+character+neural+compression+avg_word    -0.782420           5\n",
                        "49         ngram+character+compression+avg_word    -0.782422           4\n",
                        "\n",
                        "[63 rows x 3 columns]\n",
                        "\n",
                        "French Stepwise Model Selection:\n",
                        "Best combination of models for French: ['neural']\n",
                        "\n",
                        "English Stepwise Model Selection:\n",
                        "Best combination of models for English: ['neural']\n"
                    ]
                }
            ],
            "source": [
                "# We already initialized the scorers in the previous cells\n",
                "english_scorer = english_language_scorer\n",
                "french_scorer = french_language_scorer\n",
                "\n",
                "# Prepare test data\n",
                "french_test_sentences = french_samples + corrupted_french_samples + english_samples\n",
                "english_test_sentences = english_samples + corrupted_english_samples + french_samples\n",
                "\n",
                "# True labels: 2 for original samples, 1 for corrupted samples, 0 for wrong language\n",
                "french_true_labels = [2] * len(french_samples) + [1] * len(corrupted_french_samples) + [0] * len(english_samples)\n",
                "english_true_labels = [2] * len(english_samples) + [1] * len(corrupted_english_samples) + [0] * len(french_samples)\n",
                "\n",
                "# Perform factorial analysis\n",
                "print(\"French Factorial Analysis:\")\n",
                "french_factorial_results = french_scorer.factorial_analysis(french_test_sentences, french_true_labels)\n",
                "print(french_factorial_results)\n",
                "\n",
                "print(\"\\nEnglish Factorial Analysis:\")\n",
                "english_factorial_results = english_scorer.factorial_analysis(english_test_sentences, english_true_labels)\n",
                "print(english_factorial_results)\n",
                "\n",
                "# Perform stepwise model selection\n",
                "print(\"\\nFrench Stepwise Model Selection:\")\n",
                "french_best_models = french_scorer.stepwise_model_selection(french_test_sentences, french_true_labels)\n",
                "print(\"Best combination of models for French:\", french_best_models)\n",
                "\n",
                "print(\"\\nEnglish Stepwise Model Selection:\")\n",
                "english_best_models = english_scorer.stepwise_model_selection(english_test_sentences, english_true_labels)\n",
                "print(\"Best combination of models for English:\", english_best_models)"
            ]
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3"
        },
        "language_info": {
            "codemirror_mode": {
                "name": "ipython",
                "version": 3
            },
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.12.4"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 2
}