tzekid/contactions.ipynb

## contactions.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import contractions\n",
    "import re\n",
    "import difflib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "text = '''I'd been I'd be sangled in a day\n",
    "this Now I'd caught some of the things  now\n",
    "Yeah No it's a good concept  I inoduced it\n",
    "'''\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "TEST_CASES = [\n",
    "    # \"this'd be the best thing in my life\"\n",
    "]\n",
    "import re\n",
    "\n",
    "# read antconc_results_ICE-GB.txt into TEST_CASES list\n",
    "with open(\"./nlp-contraptions/sanitized_input.txt\", \"r\") as stream:\n",
    "    for line in stream:\n",
    "        TEST_CASES.append(line.strip())\n",
    "\n",
    "# remove angle brackets and their contents from TEST_CASES list of strings\n",
    "TEST_CASES = [re.sub(r'<.*?>', '', sent) for sent in TEST_CASES]\n",
    "\n",
    "# remove special characters from TEST_CASES list of strings\n",
    "TEST_CASES = [re.sub(r'[^\\w\\s\\'\\,\\!\\?]', '', sent) for sent in TEST_CASES]\n",
    "\n",
    "# remove '\\t' from TEST_CASES list of strings\n",
    "TEST_CASES = [re.sub(r'\\t', '', sent) for sent in TEST_CASES]\n",
    "\n",
    "# remove numbers from TEST_CASES list of strings\n",
    "TEST_CASES = [re.sub(r'\\d', '', sent) for sent in TEST_CASES]\n",
    "\n",
    "# remove \"SA\", \"tr\", \"ma\", \"txt\" and \"ntxt\" from TEST_CASES list of strings\n",
    "TEST_CASES = [re.sub(r'SA|SB|tr|ma|txt|ntxt', '', sent) for sent in TEST_CASES]\n",
    "\n",
    "# strip all strings from TEST_CASES list of strings\n",
    "TEST_CASES = [sent.strip() for sent in TEST_CASES]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "# read antconc_results_ICE-GB.xlsx dataframe \"antconc_GB\" with columns \"Number\", \"Context Left\", \"Contraction\", \"Contraction Meaning\", \"Context Right\" and \"Speaker Information\"\n",
    "# antconc_GB = pd.read_excel(\"./nlp-contraptions/antconc_results_ICE-GB.xlsx\", sheet_name=\"antconc_results_ICE-GB\", usecols=[\"Number\", \"Context Left\", \"Contraction\", \"Contraction Meaning\", \"Context Right\", \"Speaker Information\"])\n",
    "# antconc_GB = pd.read_excel(\"../antconc_results_ICE-GB.xlsx\", sheet_name=\"antconc_results_ICE-GB\")\n",
    "antconc_GB = pd.read_excel(\"../antconc_GB_fixed.xlsx\")\n",
    "antconc_SL = pd.read_excel(\"../antconc_SL_fixed.xlsx\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number                                                                 1\n",
      "Context Left            183:1:B> In a half-hearted way <,> <#184:1:A>...\n",
      "Contraction                                                         he'd\n",
      "Contraction Meaning                                             he would\n",
      "Context Right                       he 'd <#185:1:A> I don't know if he \n",
      "Speaker Information                                          s1a-073.txt\n",
      "full context           In a half-hearted way But he'd I don't know if he\n",
      "Name: 0, dtype: object\n",
      "\n",
      "Number                                                                 1\n",
      "Context Left            /}> without <{><[>the without that push</[> w...\n",
      "Contraction                                                         we'd\n",
      "Contraction Meaning                                             we would\n",
      "Context Left.1                       we'd</w> able to go that high up <$\n",
      "Speaker Information                        S1A1-039 tr2 ma2 17-12-22.txt\n",
      "full context             without without that we able to go that high up\n",
      "Name: 0, dtype: object\n"
     ]
    }
   ],
   "source": [
    "# print df dataframe headers\n",
    "print(antconc_GB.iloc[0])\n",
    "print()\n",
    "print(antconc_SL.iloc[0])\n",
    "# _test = antconc_GB.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# add new row \"full context\" to dataframe \"antconc_GB\"\n",
    "antconc_GB[\"full context\"] = \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pandas(Index=0, Number=1, _2=' 183:1:B> In a half-hearted way <,> <#184:1:A> But ', Contraction=\"he 'd\", _4='unclear', _5=\"he 'd <#185:1:A> I don't know if he \", _6='s1a-073.txt', _7=\"In a half-hearted way But he'd I don't know if he\")\n"
     ]
    }
   ],
   "source": [
    "\n",
    "for row in antconc_GB.itertuples():\n",
    "    full_context = row._2 + row._5\n",
    "    _words = full_context.strip().split(' ')\n",
    "    # filter _words list to remove strings starting or ending with \"<\" or \">\"\n",
    "    _words = [word for word in _words if not word.startswith('<') and not word.endswith('>')]\n",
    "    # join _words list with spaces\n",
    "    _words = ' '.join(_words)\n",
    "\n",
    "    words_with_apostrophe = re.findall(r'[\\w]+\\s\\'[\\w]+', _words)\n",
    "\n",
    "    # delete words_with_apostrophe from _words list\n",
    "    # and replace them with words_with_apostrophe equivalent, without spaces\n",
    "    # e.g. \"I 'm\" -> \"I'm\"\n",
    "    for word in words_with_apostrophe:\n",
    "        _words = _words.replace(word, word.replace(\" \", \"\"))\n",
    "\n",
    "    # add _words as string to \"full context\" column of dataframe \"antconc_GB\"\n",
    "    antconc_GB.loc[row.Index, \"full context\"] = _words\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Number                                                                 3\n",
       "Context Left                   like a waste of time and money <#43:2:A> \n",
       "Contraction                                                        He 'd\n",
       "Contraction Meaning                                               he had\n",
       "Context Right          He 'd abandoned his collapsing country for a w...\n",
       "Speaker Information                                          s2b-040.txt\n",
       "full context           like a waste of time and money He'd abandoned ...\n",
       "Name: 2, dtype: object"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "antconc_GB.iloc[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for each row get contraction from \"Contraction\" column and add fixed contraption to \"Contraction Meaning\" column\n",
    "for row in antconc_GB.itertuples():\n",
    "    contraption = row.Contraction\n",
    "    # remove all whitespace ( and spaces ) from contraption\n",
    "    contraption = contraption.replace(\" \", \"\")\n",
    "    # update \"Contraction\" column with fixed contraption\n",
    "    antconc_GB.loc[row.Index, \"Contraction\"] = contraption\n",
    "    contraption_meaning = contractions.fix(contraption)\n",
    "    antconc_GB.loc[row.Index, \"Contraction Meaning\"] = contraption_meaning\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Number                                                               101\n",
       "Context Left                Philip agreed <,> <#236:1:B> So of c I knew \n",
       "Contraction                                                         we'd\n",
       "Contraction Meaning                                             we would\n",
       "Context Right                        we 'd be out of pocket over it but \n",
       "Speaker Information                                          s1a-005.txt\n",
       "full context           Philip agreed So of c I knew we'd be out of po...\n",
       "Name: 100, dtype: object"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "antconc_GB.iloc[100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "# write antconc_GB to new excel file \"antconc_GB_fixed.xlsx\"\n",
    "antconc_GB.to_excel(\"./antconc_GB_fixed.xlsx\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "# do the same as above for \"antconc_results_ICE-SL.xlsx\"\n",
    "antconc_SL = pd.read_excel(\"./antconc_results_ICE-SL.xlsx\", sheet_name=\"antconc_results_ICE-SL\")\n",
    "antconc_SL[\"full context\"] = \"\"\n",
    "for row in antconc_SL.itertuples():\n",
    "    full_context = row._2 + row._5\n",
    "    _words = full_context.strip().split(' ')\n",
    "    _words = [word for word in _words if not word.startswith('<') and not word.endswith('>')]\n",
    "    _words = ' '.join(_words)\n",
    "    words_with_apostrophe = re.findall(r'[\\w]+\\s\\'[\\w]+', _words)\n",
    "    for word in words_with_apostrophe:\n",
    "        _words = _words.replace(word, word.replace(\" \", \"\"))\n",
    "    antconc_SL.loc[row.Index, \"full context\"] = _words\n",
    "\n",
    "for row in antconc_SL.itertuples():\n",
    "    contraption = row.Contraction\n",
    "    contraption = contraption.replace(\" \", \"\")\n",
    "    contraption_meaning = contractions.fix(contraption)\n",
    "    antconc_SL.loc[row.Index, \"Contraction\"] = contraption\n",
    "    antconc_SL.loc[row.Index, \"Contraction Meaning\"] = contraption_meaning\n",
    "\n",
    "antconc_SL.to_excel(\"./antconc_SL_fixed.xlsx\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %load ./expander.py\n",
    "#!/usr/bin/env python\n",
    "\n",
    "\"\"\" Module for expanding contractions in english text. \"\"\"\n",
    "\n",
    "__author__ = \"Yannick Couzinié\"\n",
    "\n",
    "# standard library imports\n",
    "import itertools\n",
    "import operator\n",
    "import yaml\n",
    "# third party library imports\n",
    "import nltk\n",
    "# local imports\n",
    "import utils\n",
    "\n",
    "\n",
    "def _extract_contractions(sent):\n",
    "    \"\"\"\n",
    "    Args:\n",
    "        sent - a single sentence split up into (word, pos) tuples.\n",
    "    Returns:\n",
    "        List with the indices in the sentence where the contraction\n",
    "        starts.\n",
    "        Or None if no contractions are in the sentence.\n",
    "\n",
    "    Based on the POS-tags and the existence of an apostrophe or not,\n",
    "    extract the existing contractions.\n",
    "    \"\"\"\n",
    "    idx_lst = []\n",
    "    for i, word_pos in enumerate(sent):\n",
    "        # If the word in the word_pos tuple begins with an apostrophe,\n",
    "        # add the index to idx_list.\n",
    "        if word_pos[0][0] == \"'\":\n",
    "            if word_pos[1] != 'POS':\n",
    "                # POS stands for possessive pronoun\n",
    "                idx_lst.append(i)\n",
    "        elif word_pos[0] == \"n't\":\n",
    "            # n't is treated extraordinarily and added explicitly\n",
    "            idx_lst.append(i)\n",
    "    if idx_lst:\n",
    "        return idx_lst\n",
    "\n",
    "\n",
    "def _consecutive_sub_list(int_list):\n",
    "    \"\"\"\n",
    "    Args:\n",
    "        - int_list is a list whose consecutive sub-lists are yielded\n",
    "          from this function.\n",
    "    Yields:\n",
    "        - The consecutive sub-lists\n",
    "\n",
    "    This is basically an adaptation from\n",
    "    https://docs.python.org/2.6/library/itertools.html#examples for\n",
    "    Python 3.\n",
    "    \"\"\"\n",
    "    # we group the items by using the lambda-function for the key which\n",
    "    # checks whether the next element and the current element is one\n",
    "    # apart. If it it is exactly one, the list of items that are 1 apart\n",
    "    # are grouped.\n",
    "    # The map with the itemgetter then maps the grouping to the actual\n",
    "    # items and then we yield the sublists.\n",
    "    for _, index in itertools.groupby(enumerate(int_list),\n",
    "                                      lambda x: x[1]-x[0]):\n",
    "        yield list(map(operator.itemgetter(1), index))\n",
    "\n",
    "\n",
    "def _return_replacement(inp_tuple, argmax, disambiguations):\n",
    "    \"\"\"\n",
    "    Args:\n",
    "        - inp_tuple = the input tuple which needs to be counterchecked with the\n",
    "                      dictionary.\n",
    "        - disambiguations = dictionary with all the replacements.\n",
    "        - argmax = boolean of whether to take the argmax or not, in case of\n",
    "                   ambiguous cases.\n",
    "    Returns:\n",
    "        - The recommended replacements as stored in the values of\n",
    "          disambiguations. It returns None in case of no replacements.\n",
    "\n",
    "    This function strictly serves to return the replacements for ambiguous\n",
    "    cases, i.e. as stored in disambiguations.yaml.\n",
    "    \"\"\"\n",
    "\n",
    "    if inp_tuple in disambiguations:\n",
    "        if len(disambiguations[inp_tuple].keys()) == 1:\n",
    "            # if this is unambiguous just handle it\n",
    "            return list(disambiguations[inp_tuple])[0]\n",
    "        else:\n",
    "            if not argmax:\n",
    "                # if one should not take the argmax just replace nothing. This\n",
    "                # is not recommended, but in the future it might be interesting\n",
    "                # to differentiate the cases.\n",
    "                return None\n",
    "            # if it is ambiguous find the case with the most occurences\n",
    "            max_val = max(disambiguations[inp_tuple].values())\n",
    "            if list(disambiguations[inp_tuple].values()).count(max_val) == 1:\n",
    "                # if there is exactly one replacement with the highest\n",
    "                # value, choose that\n",
    "                for key, value in disambiguations[inp_tuple].items():\n",
    "                    if value == max_val:\n",
    "                        return key\n",
    "            else:\n",
    "                # if it is still ambigious just stop at this point and\n",
    "                # work on the disambiguations dictionary.\n",
    "                return None\n",
    "    else:\n",
    "        # if the case is not even in the dictionary just skip it and\n",
    "        # work on the disambiguations dictionary.\n",
    "        return None\n",
    "\n",
    "\n",
    "def _disambiguate(sent, rplc_tuple, disambiguations, add_tags,\n",
    "                  argmax=True):\n",
    "    \"\"\"\n",
    "    Args:\n",
    "        - sent is the same sentence as in rplc_tuple but with the\n",
    "          pos_tags.\n",
    "        - rplc_tuple is the tuple containint the index of replacement,\n",
    "          the suggested replacements and the sentence.\n",
    "        - disambiguations dictionary\n",
    "        - add_tags is the amount of additional tags in the disambi\n",
    "        - in case the disambiguation case is also ambiguous use the case\n",
    "          with more occurences in the corpus. If that still doesn't help\n",
    "          don't change the input.\n",
    "    Returns:\n",
    "        - the expanded sentence (as far as unambiguous).\n",
    "\n",
    "    Use the disambiguation dictionary to disambiguate the expansions.\n",
    "    \"\"\"\n",
    "    # first we need to check again whether the first word is capitalized\n",
    "    # a special case is when the first sign actually is an apostrophe like 't\n",
    "    # (in 'tis)\n",
    "    # so check wheter one of the first two characters is upper and make sure\n",
    "    # that it is not the ner_tag\n",
    "    # if ((sent[0][0][0].isupper() or sent[0][0][1].isupper())\n",
    "    #         and sent[0][0] != \"<NE>\"):\n",
    "    #     capitalized = True\n",
    "    #     sent[0] = (sent[0][0].lower(), sent[0][1])\n",
    "    # else:\n",
    "    capitalized = False\n",
    "    # make the input tuple which is of the form of the dictionary keys\n",
    "    inp_tuple = [sent[i] for i in rplc_tuple[0]]\n",
    "    # append the pos tags for the rest\n",
    "    \n",
    "    try:\n",
    "        inp_tuple += [sent[i][1] for i in range(rplc_tuple[0][-1]+1,\n",
    "                                                rplc_tuple[0][-1]+1+add_tags)]\n",
    "    except Exception as e:\n",
    "        return sent\n",
    "\n",
    "    inp_tuple = tuple(inp_tuple)\n",
    "\n",
    "    # analyze disambiguations for the correct replacement\n",
    "    replacement = _return_replacement(inp_tuple, argmax, disambiguations)\n",
    "    # now do the replacements\n",
    "    sent = _remove_pos_tags(sent)\n",
    "    if replacement is not None:\n",
    "        for i, index in enumerate(rplc_tuple[0]):\n",
    "            sent[index] = replacement.split()[i]\n",
    "\n",
    "    if capitalized:\n",
    "        sent[0] = sent[0].title()\n",
    "    return sent\n",
    "\n",
    "\n",
    "def _check_if_contr_in_dict(consecutive, sent, contractions):\n",
    "    \"\"\"\n",
    "    Args:\n",
    "        - consecutive = a list of consecutive indices at which sent contains\n",
    "                        contractions.\n",
    "        - sent = a (word, pos_tag) list, whereby the words make up a sentence.\n",
    "        - contractions = the contractions dictionary.\n",
    "    Returns:\n",
    "        - the list of possible expansions.\n",
    "    Raises:\n",
    "        - ValueError if the contractions have questionable capitalization,\n",
    "          which will not be reproduced upon expansion since that would be too\n",
    "          cumbersome.\n",
    "    \"\"\"\n",
    "    # combine all the words that are expanded, i.e. one word\n",
    "    # before the first apostrophe until the last one with an\n",
    "    # apostrophe\n",
    "    contr = [word_pos[0] for word_pos\n",
    "             in sent[consecutive[0]:consecutive[-1]+1]]\n",
    "    # if the expanded string is one of the known contractions,\n",
    "    # extract the suggested expansions.\n",
    "    # Note that however many expansions there are, expanded is a list!\n",
    "    if ''.join(contr) in contractions:\n",
    "        expanded = contractions[''.join(contr)]\n",
    "    # the dictionary only contains non-capitalized replacements,\n",
    "    # check for capitalization\n",
    "    elif ''.join(contr).lower() in contractions:\n",
    "        if ''.join(contr)[0].isupper() or ''.join(contr)[1].isupper():\n",
    "            # capitalize the replacement in this case\n",
    "            expanded = [a.capitalize() for a in\n",
    "                        contractions[''.join(contr).lower()]]\n",
    "        else:\n",
    "            raise ValueError(\"Weird capitalization error! Please use standard \"\n",
    "                             \"english grammar.\")\n",
    "    else:\n",
    "        # if the replacement is unknown skip to the next one\n",
    "        return None, contr\n",
    "    return expanded, contr\n",
    "\n",
    "\n",
    "def _extract_replacements(idx_lst, sent, contractions):\n",
    "    \"\"\"\n",
    "    Args:\n",
    "        idx_lst - The list of indices for the position of contractions\n",
    "                  in sent.\n",
    "        sent - List of (word, pos) tuples.\n",
    "        contractions - dictionary of contractions in the form of:\n",
    "                            'contracted string' : 'list of possible\n",
    "                                                   replacements'\n",
    "    Returns:\n",
    "        A list in the form of (tuples of (index of words to be replaced,\n",
    "                                          word to be replaced,\n",
    "                                          list of suggested replacements))\n",
    "        Examples are: ([0,1], [\"I\", \"'m\"], [\"I\", \"am\"])\n",
    "            ([0,1], [\"She\", \"'s\"], [[\"She\", \"is\"], [\"She\", \"has\"]])\n",
    "\n",
    "    Based on the idx_lst and the contractions dictionary, give a list of\n",
    "    replacements which shall be performed on the words in sent.\n",
    "    \"\"\"\n",
    "    # loop over all the consecutive parts\n",
    "    for consecutive in _consecutive_sub_list(idx_lst):\n",
    "        # first test the consecutive list like this\n",
    "        expanded, contr = _check_if_contr_in_dict(consecutive,\n",
    "                                                  sent,\n",
    "                                                  contractions)\n",
    "\n",
    "        if expanded is None:\n",
    "            # add the one index prior to the first one for easier\n",
    "            consecutive = [consecutive[0]-1] + consecutive\n",
    "            expanded, contr = _check_if_contr_in_dict(consecutive,\n",
    "                                                      sent,\n",
    "                                                      contractions)\n",
    "        if expanded is None:\n",
    "            print(\"WARNING: Unknown replacement: \", ''.join(contr))\n",
    "            expanded = []\n",
    "\n",
    "        # separate the phrases into their respective words again.\n",
    "        # if \"<NE>\" in expanded[0]:\n",
    "        #     # insert a random name (here the name of a more or less famous\n",
    "        #     # japanese female head of the Ii family) to avoid <NE> being split.\n",
    "        #     expanded = [exp.replace(\"<NE>\", \"Naotora\") for exp in expanded]\n",
    "        #     expanded = [nltk.word_tokenize(a) for a in expanded]\n",
    "        #     for i, _sent in enumerate(expanded):\n",
    "        #         for j in [k for k, x in enumerate(_sent) if x == \"Naotora\"]:\n",
    "        #             expanded[i][j] = \"<NE>\"\n",
    "        # else:\n",
    "\n",
    "        expanded = [nltk.word_tokenize(a) for a in expanded]\n",
    "        yield (consecutive, contr, expanded)\n",
    "\n",
    "\n",
    "def _remove_pos_tags(sent):\n",
    "    \"\"\"\n",
    "    Args:\n",
    "        sent - list of (word, pos) tuples\n",
    "    Returns:\n",
    "        A list of only lexical items.\n",
    "\n",
    "    Convert a list of (word, pos) tuples back to a list of only words.\n",
    "    \"\"\"\n",
    "    output = []\n",
    "    for word_pos in sent:\n",
    "        output.append(word_pos[0])\n",
    "    return output\n",
    "\n",
    "\n",
    "def _do_replacements(sent, idx_lst, add_tags, contractions, disambiguations):\n",
    "\n",
    "    tmp = _remove_pos_tags(sent)\n",
    "    # only do something if there are any replacements\n",
    "    if idx_lst is None:\n",
    "        return tmp\n",
    "    # evaluate the needed replacements, and loop over them\n",
    "    for rplc_tuple in _extract_replacements(idx_lst,\n",
    "                                            sent,\n",
    "                                            contractions):\n",
    "\n",
    "        # if the replacement is unambiguous, do it.\n",
    "        if len(rplc_tuple[2]) == 1:\n",
    "            if len(rplc_tuple[1]) == len(rplc_tuple[2][0]):\n",
    "                # check that there is the exact amount of words to be\n",
    "                # replaced\n",
    "                for i, index in enumerate(rplc_tuple[0]):\n",
    "                    tmp[index] = rplc_tuple[2][0][i]\n",
    "            else:\n",
    "                for i, word in enumerate(rplc_tuple[2][0]):\n",
    "                    if i >= len(rplc_tuple[0]):\n",
    "                        # if the replacing string is longer than the\n",
    "                        # original text, we need to move all the elements\n",
    "                        # back to fit the new words in.\n",
    "\n",
    "                        # save the good text since it is not to be replaced\n",
    "                        tmp2 = tmp[rplc_tuple[0][0]+i:]\n",
    "                        # delete anything after the last replacement\n",
    "                        del tmp[rplc_tuple[0][0]+i:]\n",
    "                        # append to next words in the replacements, since from\n",
    "                        # now on every word will need to be treated like this\n",
    "                        tmp += word[i:]\n",
    "                        # add the good bits again\n",
    "                        tmp += tmp2\n",
    "                        break\n",
    "                    else:\n",
    "                        # otherwise just replace\n",
    "                        tmp[rplc_tuple[0][0]+i] = word\n",
    "                if len(rplc_tuple[2][0]) < len(rplc_tuple[0]):\n",
    "                    # if there is less to replace than there originally\n",
    "                    # was, remove anything that was not touched\n",
    "                    del tmp[rplc_tuple[0][0]+len(rplc_tuple[2][0]):\n",
    "                            rplc_tuple[0][-1]+1]\n",
    "        else:\n",
    "            # else deal with the ambiguous case\n",
    "            tmp = _disambiguate(sent, rplc_tuple,\n",
    "                                disambiguations, add_tags)\n",
    "    return tmp\n",
    "\n",
    "\n",
    "def expand_contractions(stanford_model,\n",
    "                        sent_list,\n",
    "                        is_split=True,\n",
    "                        use_ner=False,\n",
    "                        ner_args=None):\n",
    "    \"\"\"\n",
    "    Args:\n",
    "        stanford_model - object of StanfordPOSTagger, as returned by\n",
    "                         load_stanford_pos.\n",
    "        sent_list - list of sentences which are split up by word.\n",
    "                    For the splitting use nltk.word_tokenize.\n",
    "        is_split - boolean to track whether splitting has to be done\n",
    "                   or not. If it has to be done provide sentences as\n",
    "                   single strings.\n",
    "        use_ner - boolean to decide whether to use\n",
    "                  named-entity-recognition for a potential increase in\n",
    "                  accuracy but with the obvious costs of performance.\n",
    "        ner_args - is a list with an  object of StanfordNERTagger and\n",
    "                   the tag to be used. This only needs to be\n",
    "                   supplied if use_ner is true.\n",
    "    Returns:\n",
    "        sent_list with expanded contractions.\n",
    "\n",
    "    Raises:\n",
    "        ValueError if use_ner is True but no ner_model is supplied.\n",
    "\n",
    "    This method uses the StanfordPOSTagger tags to identify contractions in\n",
    "    the sentence and expand them sensibly. Some examples are:\n",
    "        \"I'm\" -> \"I am\"\n",
    "        \"It's difficult\" -> \"It is difficult\"\n",
    "    The difficulty is that sometimes \"'s\" is not an indicator of a\n",
    "    contraction but a possessive pronoun like\n",
    "        \"It's legs were shaking\"\n",
    "    which should not be expanded. The stanford tagger tags this as\n",
    "    \"POS\" for possessive which makes it easy to identify these cases.\n",
    "    Furthermore, a difficulty lies in the fact that the expansion is not\n",
    "    unique. Without context we have for example the following:\n",
    "        \"I'll\" -> \"I will\" or \"I shall\"\n",
    "    \"\"\"\n",
    "    if use_ner and (ner_args is None):\n",
    "        raise ValueError(\"The use_ner flag is True but no NER\"\n",
    "                         \" model has been supplied!\")\n",
    "\n",
    "    with open(\"contractions.yaml\", \"r\") as stream:\n",
    "        # load the dictionary containing all the contractions\n",
    "        contractions = yaml.load(stream)\n",
    "\n",
    "    with open(\"disambiguations.yaml\", \"r\") as stream:\n",
    "        disambiguations = yaml.load(stream)\n",
    "\n",
    "    # first we need to check how many additional tags there are\n",
    "    # for that take the first element of the keys list of the\n",
    "    # dictionary\n",
    "    add_tags = 0\n",
    "    for element in list(disambiguations)[0]:\n",
    "        # if the type is str and not tuple it is an additional tag\n",
    "        if isinstance(element, str):\n",
    "            add_tags += 1\n",
    "\n",
    "    output = []\n",
    "    # look at all the sentences in the list\n",
    "    for word_pos_ner in utils.conv_2_word_pos(stanford_model,\n",
    "                                              sent_list,\n",
    "                                              is_split=is_split,\n",
    "                                              use_ner=use_ner,\n",
    "                                              ner_args=ner_args):\n",
    "        if use_ner:\n",
    "            # the actual sentence is just the first element, the second\n",
    "            # one is the list of strings that were replaced (i.e. the\n",
    "            # named-entities).\n",
    "            sent = word_pos_ner[0]\n",
    "        else:\n",
    "            sent = word_pos_ner\n",
    "\n",
    "        # get all the indices of the contractions\n",
    "        idx_lst = _extract_contractions(sent)\n",
    "\n",
    "        sent = _do_replacements(sent,\n",
    "                                idx_lst,\n",
    "                                add_tags,\n",
    "                                contractions,\n",
    "                                disambiguations)\n",
    "        output.append(sent)\n",
    "        # at this point there is definetly the next item added to\n",
    "        # output. So just replace the NER-tag now\n",
    "        if use_ner:\n",
    "            # just replace it in the last element\n",
    "            output[-1] = utils.ner_to_sent(output[-1],\n",
    "                                           word_pos_ner[1],\n",
    "                                           tag=ner_args[1])\n",
    "    if not is_split:\n",
    "        # join the sentences if they were joined in the beginning\n",
    "        output = [' '.join(sent) for sent in output]\n",
    "        # remove the space in front of the punctuations.\n",
    "        output = [sent.replace(\" '\", \"'\") for sent in output]\n",
    "        output = [sent.replace(\" ;\", \";\") for sent in output]\n",
    "        output = [sent.replace(\" :\", \":\") for sent in output]\n",
    "        output = [sent.replace(\" .\", \".\") for sent in output]\n",
    "        output = [sent.replace(\" ,\", \",\") for sent in output]\n",
    "        output = [sent.replace(\" !\", \"!\") for sent in output]\n",
    "        output = [sent.replace(\" ?\", \"?\") for sent in output]\n",
    "    return output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"but he'd I don't\""
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "_words = \"but he 'd I don't\"\n",
    "words_with_apostrophe = re.findall(r'[\\w]+\\s\\'[\\w]+', _words)\n",
    "for word in words_with_apostrophe:\n",
    "    _words = _words.replace(word, word.replace(\" \", \"\"))\n",
    "_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# add new row to antconc_SL dataframe, named \"done\"\n",
    "# and set it to False\n",
    "antconc_SL[\"done\"] = False\n",
    "antconc_GB[\"done\"] = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "LOG = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING: Unknown replacement:  employee'\n",
      "################################################################\n",
      "############## ERROR WITH 649  ###############\n",
      "################################################################\n",
      "-sl s a- b how many of you'll have spoken to your employee'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/kid/miniforge3/lib/python3.9/site-packages/nltk/tag/stanford.py:149: DeprecationWarning: \n",
      "The StanfordTokenizer will be deprecated in version 3.2.5.\n",
      "Please use \u001b[91mnltk.tag.corenlp.CoreNLPPOSTagger\u001b[0m or \u001b[91mnltk.tag.corenlp.CoreNLPNERTagger\u001b[0m instead.\n",
      "  super(StanfordPOSTagger, self).__init__(*args, **kwargs)\n",
      "/Users/kid/miniforge3/lib/python3.9/site-packages/nltk/tag/stanford.py:183: DeprecationWarning: \n",
      "The StanfordTokenizer will be deprecated in version 3.2.5.\n",
      "Please use \u001b[91mnltk.tag.corenlp.CoreNLPPOSTagger\u001b[0m or \u001b[91mnltk.tag.corenlp.CoreNLPNERTagger\u001b[0m instead.\n",
      "  super(StanfordNERTagger, self).__init__(*args, **kwargs)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING: Unknown replacement:  we'\n",
      "################################################################\n",
      "############## ERROR WITH 664  ###############\n",
      "################################################################\n",
      "in the trees as green so we'll if someone asks us we'\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/kid/miniforge3/lib/python3.9/site-packages/nltk/tag/stanford.py:149: DeprecationWarning: \n",
      "The StanfordTokenizer will be deprecated in version 3.2.5.\n",
      "Please use \u001b[91mnltk.tag.corenlp.CoreNLPPOSTagger\u001b[0m or \u001b[91mnltk.tag.corenlp.CoreNLPNERTagger\u001b[0m instead.\n",
      "  super(StanfordPOSTagger, self).__init__(*args, **kwargs)\n",
      "/Users/kid/miniforge3/lib/python3.9/site-packages/nltk/tag/stanford.py:183: DeprecationWarning: \n",
      "The StanfordTokenizer will be deprecated in version 3.2.5.\n",
      "Please use \u001b[91mnltk.tag.corenlp.CoreNLPPOSTagger\u001b[0m or \u001b[91mnltk.tag.corenlp.CoreNLPNERTagger\u001b[0m instead.\n",
      "  super(StanfordNERTagger, self).__init__(*args, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "from IPython.display import clear_output\n",
    "from tqdm import tqdm\n",
    "\n",
    "# for df in [antconc_SL, antconc_GB]:\n",
    "for df in [antconc_GB, antconc_SL]:\n",
    "    # for row in df.itertuples():\n",
    "    for row in df.itertuples():\n",
    "        # print(row)\n",
    "        # break\n",
    "\n",
    "        # clear_output(wait=True)\n",
    "\n",
    "        if row.done == True:\n",
    "            continue\n",
    "\n",
    "        # Pandas(Index=0, Number=1, _2=' /}> without <{><[>the without that push</[> we <w>', Contraction=\"we'd\", _4='we would', _5=\"we'd</w> able to go that high up <$\", _6='S1A1-039 tr2 ma2 17-12-22.txt', _7='without without that we able to go that high up')\n",
    "\n",
    "        sentence = row._2 + \" \" + row._5\n",
    "        sentence = sentence.strip()\n",
    "        # e.g. raw sentence ` /}> without <{><[>the without that push</[> we <w>we'd</w> able to go that high up <$`\n",
    "\n",
    "        # remove brackets and brackets content\n",
    "        sentence = re.sub(r\"<[^>]*>\", \"\", sentence)\n",
    "\n",
    "        # remove words that contain non letters or commas or dots or hyphens or apostrophes\n",
    "        sentence = re.sub(r\"[^a-zA-Z,\\.\\-\\']\", \" \", sentence)\n",
    "        sentence = sentence.strip()\n",
    "        sentence = sentence.lower()\n",
    "\n",
    "        # capitalize the \"I\" in words if it is followed by a apostrophe\n",
    "        # e.g. \"i've\" -> \"I've\"\n",
    "        sentence = re.sub(r\"i\\'\", \"I'\", sentence)\n",
    "\n",
    "        # only one space inbetween words\n",
    "        sentence = re.sub(r\"\\s+\", \" \", sentence)\n",
    "        sentence = \" \".join(list(map(lambda x: x.upper() if x == \"i\" else x, sentence.split())))\n",
    "\n",
    "        words_with_apostrophe = re.findall(r'[\\w]+\\s\\'[\\w]+', sentence)\n",
    "        for word in words_with_apostrophe:\n",
    "            sentence = sentence.replace(word, word.replace(\" \", \"\"))\n",
    "\n",
    "        if sentence == \"\":\n",
    "            continue\n",
    "\n",
    "        df.loc[row.Index, \"full context\"] = sentence\n",
    "\n",
    "        # import pandas as pd\n",
    "        # df = pd.DataFrame([sentence])\n",
    "        # df.to_clipboard(index=False,header=False)\n",
    "        # print(sentence)\n",
    "\n",
    "        # break\n",
    "\n",
    "        # use nltk to split the strings into words\n",
    "        POS_MODEL = utils.load_stanford(model=\"pos\")\n",
    "        NER_MODEL = utils.load_stanford(model=\"ner\")\n",
    "\n",
    "        TEST_CASES = [sentence]\n",
    "\n",
    "        # expand the sentences\n",
    "        try:\n",
    "            EXPANDED_LIST = expand_contractions(\n",
    "                POS_MODEL, TEST_CASES, is_split=False, use_ner=False\n",
    "            )\n",
    "        except Exception as e:\n",
    "            # shit went wrong...\n",
    "\n",
    "            print(\"################################################################\")\n",
    "            print(f\"############## ERROR WITH {row.Index}  ###############\")\n",
    "            print(\"################################################################\")\n",
    "\n",
    "            LOG += [e, sentence]\n",
    "            print(LOG[-1])\n",
    "            continue\n",
    "\n",
    "        # for SENT in EXPANDED_LIST:\n",
    "        #     print(SENT)\n",
    "\n",
    "        # print differences between the strings from TEST_CASES[0] and EXPANDED_LIST[0]\n",
    "        _diff = difflib.ndiff(TEST_CASES[0].split(), EXPANDED_LIST[0].split())\n",
    "        contraptions = []\n",
    "        expanded = []\n",
    "        for d in list(_diff):\n",
    "            if d.startswith(\"-\"):\n",
    "                contraptions.append(d[2:].strip().lower())\n",
    "            elif d.startswith(\"+\"):\n",
    "                expanded.append(d[2:].strip().lower())\n",
    "\n",
    "        # connect word pairs from expanded with spaces inbetween\n",
    "        expanded = [\" \".join(pair) for pair in zip(expanded[::2], expanded[1::2])]\n",
    "        contraction_epanded_dict = dict(zip(contraptions, expanded))\n",
    "\n",
    "        # _4 == \"Contraction Meaning\"\n",
    "\n",
    "        # skip if models doesn't find contractions\n",
    "        if row.Contraction.strip().lower() not in contraction_epanded_dict:\n",
    "            print(f\"############## SKIPPING {row.Index}  ###############\")\n",
    "            print(sentence)\n",
    "            print(row.Contraction)\n",
    "            print(contraction_epanded_dict)\n",
    "            print(\"################################################################\")\n",
    "            continue\n",
    "\n",
    "        df.loc[row.Index, \"Contraction Meaning\"] = contraction_epanded_dict[\n",
    "            row.Contraction.lower()\n",
    "        ]\n",
    "        df.loc[row.Index, \"done\"] = True\n",
    "\n",
    "        print(\"################################################################\")\n",
    "        print(f\"############## DONE WITH {row.Index}  ###############\")\n",
    "        print(\"################################################################\")\n",
    "\n",
    "\n",
    "        if row.Index % 100 == 0:\n",
    "            antconc_GB.to_excel(\"./antconc_GB_progress.xlsx\", index=False)\n",
    "            antconc_SL.to_excel(\"./antconc_SL_progress.xlsx\", index=False)\n",
    "\n",
    "        # print(row)\n",
    "        # break\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1076\n"
     ]
    }
   ],
   "source": [
    "# sum number of all rows in antconc_SL dataframe which have \"done\" set to True\n",
    "print(antconc_SL[antconc_SL[\"done\"] == True].shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Number                                                                 1\n",
       "Context Left            /}> without <{><[>the without that push</[> w...\n",
       "Contraction                                                         we'd\n",
       "Contraction Meaning                                               we had\n",
       "Context Left.1                       we'd</w> able to go that high up <$\n",
       "Speaker Information                        S1A1-039 tr2 ma2 17-12-22.txt\n",
       "full context                without the without that push we we'd abl...\n",
       "Name: 0, dtype: object"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "antconc_SL.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "antconc.to_csv(\"antconc_SL_expanded.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "was the side-effects of encephalitis from which they had all suffered in the nineteen twenties\n"
     ]
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[\"they'd\"]\n"
     ]
    }
   ],
   "source": [
    "print(contraptions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['they had']\n"
     ]
    }
   ],
   "source": [
    "print(expanded)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "c6a95d4d55baca1bf8ed8b8c92a9b1baf798d5a61792e9862de346df9cd01418"
  },
  "kernelspec": {
   "display_name": "Python 3.9.9 64-bit ('base': conda)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

## linguistics.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              linguistics.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.