-
-
Save kylebgorman/341e6c32801bb8bd7a2fda025c4da21b to your computer and use it in GitHub Desktop.
Methods I HW5 solution
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "_Vz0Of0pRv0N" | |
}, | |
"source": [ | |
"# HW5 solution" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"id": "XUApqES3RyMl" | |
}, | |
"outputs": [], | |
"source": [ | |
"from typing import List" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "i0Ng0rRzQaPp" | |
}, | |
"source": [ | |
"## Part 1: a sniffing function" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"id": "qVXs4F8dQp85" | |
}, | |
"outputs": [], | |
"source": [ | |
import chardet


def sniff(filepath: str) -> str:
    """Sniffs the best encoding using chardet.

    Args:
        filepath: path to a file to be sniffed.

    Returns:
        The best guess encoding string.

    NOTE(review): chardet reports None for the encoding when it cannot make
    a guess (e.g., an empty file), so callers should be prepared for a
    non-str return in that case — confirm against chardet.detect's docs.
    """
    # chardet operates on raw bytes, so the file is opened in binary mode
    # and read in full before detection.
    with open(filepath, "rb") as source:
        result = chardet.detect(source.read())
    return result["encoding"]
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "QXurUKXoROIE", | |
"outputId": "12cd3d2c-6c1f-409d-8c30-9c90fa0f79e1" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" % Total % Received % Xferd Average Speed Time Time Time Current\n", | |
" Dload Upload Total Spent Left Speed\n", | |
"100 1289 100 1289 0 0 602 0 0:00:02 0:00:02 --:--:-- 602\n" | |
] | |
} | |
], | |
"source": [ | |
# Download the mystery file and confirm the sniffer identifies its encoding
# as Shift JIS (a legacy Japanese encoding), per the assignment's expectation.
! curl -O https://www.wellformedness.com/courses/LING78100/data/mystery.txt

assert sniff("mystery.txt") == "SHIFT_JIS"
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "GTTGNEp4emK2" | |
}, | |
"source": [ | |
"## Part 2: word frequencies" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Km8jUOJpfJN4", | |
"outputId": "1c3c2e8f-deef-4077-83e5-d7bec1050a2d" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" % Total % Received % Xferd Average Speed Time Time Time Current\n", | |
" Dload Upload Total Spent Left Speed\n", | |
"100 80949 100 80949 0 0 58856 0 0:00:01 0:00:01 --:--:-- 58872\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[nltk_data] Downloading package punkt to /home/kbg/nltk_data...\n", | |
"[nltk_data] Unzipping tokenizers/punkt.zip.\n" | |
] | |
} | |
], | |
"source": [ | |
# Setup stuff: fetch the corpus and the NLTK tokenizer model used below.
! curl -O https://www.wellformedness.com/courses/LING78100/data/scorpio.txt

import nltk


# nltk.download returns True on success, so the assert fails fast if the
# "punkt" tokenizer data could not be fetched.
assert nltk.download("punkt")
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "iME5v3f0eoj9", | |
"outputId": "8d78a876-95c6-4a6a-a9b2-405f66de2e8f" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
".: 857\n", | |
"you: 848\n", | |
",: 603\n", | |
"to: 542\n", | |
"the: 481\n", | |
"your: 409\n", | |
"and: 389\n", | |
"a: 355\n", | |
"it: 289\n", | |
"of: 238\n" | |
] | |
} | |
], | |
"source": [ | |
"import collections\n", | |
"\n", | |
"\n", | |
"frequencies = collections.Counter()\n", | |
"with open(\"scorpio.txt\", \"r\") as source:\n", | |
" for line in source:\n", | |
" # Note that the tokenizer strips trailing whitespace for you.\n", | |
" tokens = nltk.word_tokenize(line)\n", | |
" folded_tokens = [token.casefold() for token in tokens]\n", | |
" frequencies.update(folded_tokens)\n", | |
"for word, freq in frequencies.most_common(10):\n", | |
" print(f\"{word}: {freq:,}\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "LOmAQQcWjWwl" | |
}, | |
"source": [ | |
"## Part 3: unique words" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "YkGGIMaglQ87", | |
"outputId": "aa5893c5-f590-469a-bb84-0f6e988053c7" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"There are 2,529 unique words in the corpus.\n" | |
] | |
} | |
], | |
"source": [ | |
# Collect the set of distinct case-folded tokens across the whole corpus.
words = set()
with open("scorpio.txt", "r") as source:
    for line in source:
        # The tokenizer strips trailing whitespace for us; the set absorbs
        # duplicates, so only distinct case-folded tokens remain.
        words.update(token.casefold() for token in nltk.word_tokenize(line))
print(f"There are {len(words):,} unique words in the corpus.")
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Part 4: reflection" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Your reflection here." | |
] | |
} | |
], | |
"metadata": { | |
"colab": { | |
"collapsed_sections": [], | |
"provenance": [] | |
}, | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment