Methods I HW6 solution
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "X5isTxpIrb53"
},
"source": [
"# HW6 solution"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "3qNxJXQprXn9"
},
"source": [
"## Part 1"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Ux8G9GR8rTH8",
"outputId": "8aa9d93d-4e01-42e1-c454-2365aca92ffe"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 80949 100 80949 0 0 136k 0 --:--:-- --:--:-- --:--:-- 136k\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"source": [
"! curl -O https://www.wellformedness.com/courses/LING78100/data/scorpio.txt\n",
"\n",
"import nltk\n",
"assert nltk.download(\"punkt\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "9ftxeUi4rjrE",
"outputId": "4fb718d8-8b99-4a4f-e31c-b83d7d8fdfd5"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The ten shortest words in the corpus are:\n",
"\t'\n",
"\t:\n",
"\t(\n",
"\ta\n",
"\t;\n",
"\t.\n",
"\t-\n",
"\t!\n",
"\t)\n",
"\t,\n",
"\n",
"The ten longest words in the corpus are:\n",
"\tfairytale-worthy\n",
"\talways-in-a-rush\n",
"\tresponsibilities\n",
"\tbehind-the-scenes\n",
"\tless-conventional\n",
"\tunder-appreciated\n",
"\tprivate-investigator\n",
"\tlotharios-or-lotharias\n",
"\tkinda-sorta-not-really\n",
"\twhat-you-see-is-what-you-get\n"
]
}
],
"source": [
"lexicon = set()\n",
"with open(\"scorpio.txt\", \"r\") as source:\n",
"    for line in source:\n",
"        tokens = nltk.word_tokenize(line)\n",
"        lexicon.update(token.casefold() for token in tokens)\n",
"lexicon_by_len = sorted(lexicon, key=len)\n",
"print(\"The ten shortest words in the corpus are:\")\n",
"for word in lexicon_by_len[:10]:\n",
"    print(f\"\\t{word}\")\n",
"print()  # Blank line. It looks nice.\n",
"print(\"The ten longest words in the corpus are:\")\n",
"for word in lexicon_by_len[-10:]:\n",
"    print(f\"\\t{word}\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "D9vnVEbpswl9"
},
"source": [
"## Stretch goal 1"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "i2_KoPMhsx6_",
"outputId": "adf430ee-53c6-4e8b-bff4-7dc5bbe109ff"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The ten shortest words in the corpus are:\n",
"\ta\n",
"\tgo\n",
"\t24\n",
"\ton\n",
"\tbe\n",
"\tdo\n",
"\tno\n",
"\twe\n",
"\tse\n",
"\tto\n",
"\n",
"The ten longest words in the corpus are:\n",
"\tfairytale-worthy\n",
"\talways-in-a-rush\n",
"\tresponsibilities\n",
"\tbehind-the-scenes\n",
"\tless-conventional\n",
"\tunder-appreciated\n",
"\tprivate-investigator\n",
"\tlotharios-or-lotharias\n",
"\tkinda-sorta-not-really\n",
"\twhat-you-see-is-what-you-get\n"
]
}
],
"source": [
"import string\n",
"\n",
"\n",
"punctuation = frozenset(string.punctuation)\n",
"\n",
"lexicon = set()\n",
"with open(\"scorpio.txt\", \"r\") as source:\n",
"    for line in source:\n",
"        tokens = nltk.word_tokenize(line)\n",
"        # Note change here.\n",
"        lexicon.update(\n",
"            token.casefold() for token in tokens if token not in punctuation\n",
"        )\n",
"lexicon_by_len = sorted(lexicon, key=len)\n",
"print(\"The ten shortest words in the corpus are:\")\n",
"for word in lexicon_by_len[:10]:\n",
"    print(f\"\\t{word}\")\n",
"print()\n",
"print(\"The ten longest words in the corpus are:\")\n",
"for word in lexicon_by_len[-10:]:\n",
"    print(f\"\\t{word}\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "uBpBgC1YtXg-"
},
"source": [
"Arguably, this is more what one has in mind when one thinks about the shortest and longest words in a corpus."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "6ccEeg4Atfcd"
},
"source": [
"## Stretch goal 2"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Zd7X25r1tegi",
"outputId": "9daf7a60-b327-4b84-819b-810a134162a7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The ten shortest words in the corpus are:\n",
"\ta\n",
"\tgo\n",
"\t24\n",
"\ton\n",
"\tbe\n",
"\tdo\n",
"\tno\n",
"\twe\n",
"\tse\n",
"\tto\n",
"\n",
"The ten longest words in the corpus are:\n",
"\twhat-you-see-is-what-you-get\n",
"\tlotharios-or-lotharias\n",
"\tkinda-sorta-not-really\n",
"\tprivate-investigator\n",
"\tbehind-the-scenes\n",
"\tless-conventional\n",
"\tunder-appreciated\n",
"\tfairytale-worthy\n",
"\talways-in-a-rush\n",
"\tresponsibilities\n"
]
}
],
"source": [
"import heapq\n",
"\n",
"\n",
"lexicon = set()\n",
"with open(\"scorpio.txt\", \"r\") as source:\n",
"    for line in source:\n",
"        tokens = nltk.word_tokenize(line)\n",
"        lexicon.update(\n",
"            token.casefold() for token in tokens if token not in punctuation\n",
"        )\n",
"selection = heapq.nsmallest(10, lexicon, key=len)\n",
"print(\"The ten shortest words in the corpus are:\")\n",
"for word in selection:\n",
"    print(f\"\\t{word}\")\n",
"print()  # Blank line. It looks nice.\n",
"selection = heapq.nlargest(10, lexicon, key=len)\n",
"print(\"The ten longest words in the corpus are:\")\n",
"for word in selection:\n",
"    print(f\"\\t{word}\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EePR2GSUuTGD"
},
"source": [
"Whereas sorting the entire lexicon has complexity $O(n \\log n)$, finding the $k$ longest or shortest words with the heap operations as complexity $O(n \\log k)$. When $n$ (the size of the list) is quite large and $k$ (the number of desired elements) is quite a bit smaller, this is a substantial improvement." | |
]
},
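{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick, informal check of that claim, the cell below (a sketch added for illustration, not part of the graded solution) times the two approaches with `timeit` against the lexicon built above. Absolute numbers will vary by machine, and on a corpus this small the gap may be modest; it should widen as $n$ grows relative to $k$."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import timeit\n",
"\n",
"\n",
"# Full sort, then slice off the first ten: O(n log n) per call.\n",
"sort_time = timeit.timeit(\n",
"    lambda: sorted(lexicon, key=len)[:10], number=1000\n",
")\n",
"# Heap-based selection of the ten smallest: O(n log k) per call.\n",
"heap_time = timeit.timeit(\n",
"    lambda: heapq.nsmallest(10, lexicon, key=len), number=1000\n",
")\n",
"print(f\"full sort:  {sort_time:.3f}s\")\n",
"print(f\"nsmallest:  {heap_time:.3f}s\")"
]
},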
{
"cell_type": "markdown",
"metadata": {
"id": "6X07X3swvM_o"
},
"source": [
"## Part 2"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "UVewIHyfvN1E",
"outputId": "0bf3026e-1703-4d70-8545-d2aeb6d12ef6"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 271 100 271 0 0 1328 0 --:--:-- --:--:-- --:--:-- 1328\n"
]
}
],
"source": [
"! curl -O https://www.wellformedness.com/courses/LING78100/data/inventory.txt"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "446jDCPW2jWr"
},
"outputs": [],
"source": [
"inventory = {}\n",
"with open(\"inventory.txt\", \"r\") as source:\n",
"    for line in source:\n",
"        line = line.rstrip()\n",
"        if \" \" in line:\n",
"            item, count_str = line.split()\n",
"            # Not strictly necessary, but makes things easier to read.\n",
"            item = item.replace(\"-\", \" \")\n",
"            count = int(count_str)\n",
"            inventory[item] = count\n",
"        else:\n",
"            # Ditto.\n",
"            item = line.replace(\"-\", \" \")\n",
"            inventory[item] = 1\n",
"assert inventory[\"Hardtack\"] == 5\n",
"assert \"Sword Of Damocles\" in inventory\n",
"assert \"Killer Whale Potion\" not in inventory\n",
"assert \"Bloodsword\" not in inventory"
]
},
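{
"cell_type": "markdown",
"metadata": {},
"source": [
"A side note on the choice of data structure (a sketch, not part of the required solution): with a plain `dict`, an item that appeared on more than one line would silently overwrite the earlier count. The asserts above suggest the file never does this, but if duplicates were possible, a `collections.Counter` would accumulate counts instead, as below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import collections\n",
"\n",
"\n",
"# Counter defaults missing keys to 0, so we can accumulate with +=.\n",
"inventory = collections.Counter()\n",
"with open(\"inventory.txt\", \"r\") as source:\n",
"    for line in source:\n",
"        line = line.rstrip()\n",
"        if \" \" in line:\n",
"            item, count_str = line.split()\n",
"            inventory[item.replace(\"-\", \" \")] += int(count_str)\n",
"        else:\n",
"            inventory[line.replace(\"-\", \" \")] += 1\n",
"# Same result as the dict version so long as no item recurs.\n",
"assert inventory[\"Hardtack\"] == 5"
]
},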
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Part 3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Your reflection here."
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 1
} |