Methods I HW6 solution
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "X5isTxpIrb53"
},
"source": [
"# HW6 solution"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "3qNxJXQprXn9"
},
"source": [
"## Part 1"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Ux8G9GR8rTH8",
"outputId": "8aa9d93d-4e01-42e1-c454-2365aca92ffe"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 80949 100 80949 0 0 136k 0 --:--:-- --:--:-- --:--:-- 136k\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"source": [
"! curl -O https://www.wellformedness.com/courses/LING78100/data/scorpio.txt\n",
"\n",
"import nltk\n",
"assert nltk.download(\"punkt\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "9ftxeUi4rjrE",
"outputId": "4fb718d8-8b99-4a4f-e31c-b83d7d8fdfd5"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The ten shortest words in the corpus are:\n",
"\t'\n",
"\t:\n",
"\t(\n",
"\ta\n",
"\t;\n",
"\t.\n",
"\t-\n",
"\t!\n",
"\t)\n",
"\t,\n",
"\n",
"The ten longest words in the corpus are:\n",
"\tfairytale-worthy\n",
"\talways-in-a-rush\n",
"\tresponsibilities\n",
"\tbehind-the-scenes\n",
"\tless-conventional\n",
"\tunder-appreciated\n",
"\tprivate-investigator\n",
"\tlotharios-or-lotharias\n",
"\tkinda-sorta-not-really\n",
"\twhat-you-see-is-what-you-get\n"
]
}
],
"source": [
"lexicon = set()\n",
"with open(\"scorpio.txt\", \"r\") as source:\n",
"    for line in source:\n",
"        tokens = nltk.word_tokenize(line)\n",
"        lexicon.update(token.casefold() for token in tokens)\n",
"lexicon_by_len = sorted(lexicon, key=len)\n",
"print(\"The ten shortest words in the corpus are:\")\n",
"for word in lexicon_by_len[:10]:\n",
"    print(f\"\\t{word}\")\n",
"print()  # Blank line. It looks nice.\n",
"print(\"The ten longest words in the corpus are:\")\n",
"for word in lexicon_by_len[-10:]:\n",
"    print(f\"\\t{word}\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "D9vnVEbpswl9"
},
"source": [
"## Stretch goal 1"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "i2_KoPMhsx6_",
"outputId": "adf430ee-53c6-4e8b-bff4-7dc5bbe109ff"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The ten shortest words in the corpus are:\n",
"\ta\n",
"\tgo\n",
"\t24\n",
"\ton\n",
"\tbe\n",
"\tdo\n",
"\tno\n",
"\twe\n",
"\tse\n",
"\tto\n",
"\n",
"The ten longest words in the corpus are:\n",
"\tfairytale-worthy\n",
"\talways-in-a-rush\n",
"\tresponsibilities\n",
"\tbehind-the-scenes\n",
"\tless-conventional\n",
"\tunder-appreciated\n",
"\tprivate-investigator\n",
"\tlotharios-or-lotharias\n",
"\tkinda-sorta-not-really\n",
"\twhat-you-see-is-what-you-get\n"
]
}
],
"source": [
"import string\n",
"\n",
"\n",
"punctuation = frozenset(string.punctuation)\n",
"\n",
"lexicon = set()\n",
"with open(\"scorpio.txt\", \"r\") as source:\n",
"    for line in source:\n",
"        tokens = nltk.word_tokenize(line)\n",
"        # Note change here.\n",
"        lexicon.update(\n",
"            token.casefold() for token in tokens if token not in punctuation\n",
"        )\n",
"lexicon_by_len = sorted(lexicon, key=len)\n",
"print(\"The ten shortest words in the corpus are:\")\n",
"for word in lexicon_by_len[:10]:\n",
"    print(f\"\\t{word}\")\n",
"print()\n",
"print(\"The ten longest words in the corpus are:\")\n",
"for word in lexicon_by_len[-10:]:\n",
"    print(f\"\\t{word}\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "uBpBgC1YtXg-"
},
"source": [
"Arguably, this is more what one has in mind when one thinks about the shortest and longest words in a corpus."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "6ccEeg4Atfcd"
},
"source": [
"## Stretch goal 2"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Zd7X25r1tegi",
"outputId": "9daf7a60-b327-4b84-819b-810a134162a7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The ten shortest words in the corpus are:\n",
"\ta\n",
"\tgo\n",
"\t24\n",
"\ton\n",
"\tbe\n",
"\tdo\n",
"\tno\n",
"\twe\n",
"\tse\n",
"\tto\n",
"\n",
"The ten longest words in the corpus are:\n",
"\twhat-you-see-is-what-you-get\n",
"\tlotharios-or-lotharias\n",
"\tkinda-sorta-not-really\n",
"\tprivate-investigator\n",
"\tbehind-the-scenes\n",
"\tless-conventional\n",
"\tunder-appreciated\n",
"\tfairytale-worthy\n",
"\talways-in-a-rush\n",
"\tresponsibilities\n"
]
}
],
"source": [
"import heapq\n",
"\n",
"\n",
"lexicon = set()\n",
"with open(\"scorpio.txt\", \"r\") as source:\n",
"    for line in source:\n",
"        tokens = nltk.word_tokenize(line)\n",
"        lexicon.update(\n",
"            token.casefold() for token in tokens if token not in punctuation\n",
"        )\n",
"selection = heapq.nsmallest(10, lexicon, key=len)\n",
"print(\"The ten shortest words in the corpus are:\")\n",
"for word in selection:\n",
"    print(f\"\\t{word}\")\n",
"print()  # Blank line. It looks nice.\n",
"selection = heapq.nlargest(10, lexicon, key=len)\n",
"print(\"The ten longest words in the corpus are:\")\n",
"for word in selection:\n",
"    print(f\"\\t{word}\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EePR2GSUuTGD"
},
"source": [
"Whereas sorting the entire lexicon has complexity $O(n \\log n)$, finding the $k$ longest or shortest words with the heap operations as complexity $O(n \\log k)$. When $n$ (the size of the list) is quite large and $k$ (the number of desired elements) is quite a bit smaller, this is a substantial improvement." | |
]
},
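{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick, informal check of that claim, the cell below (a sketch added for illustration, not part of the graded solution) times the two approaches with `timeit` against the lexicon built above. Absolute numbers will vary by machine, and on a corpus this small the gap may be modest; it should widen as $n$ grows relative to $k$."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import timeit\n",
"\n",
"\n",
"# Full sort, then slice off the first ten: O(n log n) per call.\n",
"sort_time = timeit.timeit(\n",
"    lambda: sorted(lexicon, key=len)[:10], number=1000\n",
")\n",
"# Heap-based selection of the ten smallest: O(n log k) per call.\n",
"heap_time = timeit.timeit(\n",
"    lambda: heapq.nsmallest(10, lexicon, key=len), number=1000\n",
")\n",
"print(f\"full sort:  {sort_time:.3f}s\")\n",
"print(f\"nsmallest:  {heap_time:.3f}s\")"
]
},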
{
"cell_type": "markdown",
"metadata": {
"id": "6X07X3swvM_o"
},
"source": [
"## Part 2"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "UVewIHyfvN1E",
"outputId": "0bf3026e-1703-4d70-8545-d2aeb6d12ef6"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 271 100 271 0 0 1328 0 --:--:-- --:--:-- --:--:-- 1328\n"
]
}
],
"source": [
"! curl -O https://www.wellformedness.com/courses/LING78100/data/inventory.txt"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "446jDCPW2jWr"
},
"outputs": [],
"source": [
"inventory = {}\n",
"with open(\"inventory.txt\", \"r\") as source:\n",
"    for line in source:\n",
"        line = line.rstrip()\n",
"        if \" \" in line:\n",
"            item, count_str = line.split()\n",
"            # Not strictly necessary, but makes things easier to read.\n",
"            item = item.replace(\"-\", \" \")\n",
"            count = int(count_str)\n",
"            inventory[item] = count\n",
"        else:\n",
"            # Ditto.\n",
"            item = line.replace(\"-\", \" \")\n",
"            inventory[item] = 1\n",
"assert inventory[\"Hardtack\"] == 5\n",
"assert \"Sword Of Damocles\" in inventory\n",
"assert \"Killer Whale Potion\" not in inventory\n",
"assert \"Bloodsword\" not in inventory"
]
},
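{
"cell_type": "markdown",
"metadata": {},
"source": [
"A side note on the choice of data structure (a sketch, not part of the required solution): with a plain `dict`, an item that appeared on more than one line would silently overwrite the earlier count. The asserts above suggest the file never does this, but if duplicates were possible, a `collections.Counter` would accumulate counts instead, as below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import collections\n",
"\n",
"\n",
"# Counter defaults missing keys to 0, so we can accumulate with +=.\n",
"inventory = collections.Counter()\n",
"with open(\"inventory.txt\", \"r\") as source:\n",
"    for line in source:\n",
"        line = line.rstrip()\n",
"        if \" \" in line:\n",
"            item, count_str = line.split()\n",
"            inventory[item.replace(\"-\", \" \")] += int(count_str)\n",
"        else:\n",
"            inventory[line.replace(\"-\", \" \")] += 1\n",
"# Same result as the dict version so long as no item recurs.\n",
"assert inventory[\"Hardtack\"] == 5"
]
},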
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Part 3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Your reflection here."
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 1
} |