{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "_Vz0Of0pRv0N"
},
"source": [
"# HW5 solution"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "XUApqES3RyMl"
},
"outputs": [],
"source": [
"from typing import List"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "i0Ng0rRzQaPp"
},
"source": [
"## Part 1: a sniffing function"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "qVXs4F8dQp85"
},
"outputs": [],
"source": [
"import chardet\n",
"\n",
"\n",
"def sniff(filepath: str) -> str:\n",
"    \"\"\"Sniffs the best encoding using chardet.\n",
"\n",
"    Args:\n",
"        filepath: path to a file to be sniffed.\n",
"\n",
"    Returns:\n",
"        The best guess encoding string.\n",
"    \"\"\"\n",
"    with open(filepath, \"rb\") as source:\n",
"        result = chardet.detect(source.read())\n",
"    return result[\"encoding\"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "QXurUKXoROIE",
"outputId": "12cd3d2c-6c1f-409d-8c30-9c90fa0f79e1"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 1289 100 1289 0 0 602 0 0:00:02 0:00:02 --:--:-- 602\n"
]
}
],
"source": [
"! curl -O https://www.wellformedness.com/courses/LING78100/data/mystery.txt\n",
"\n",
"assert sniff(\"mystery.txt\") == \"SHIFT_JIS\""
]
},
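{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an aside, `chardet.detect` also reports a confidence score (a float between 0 and 1) alongside its encoding guess. A minimal sketch of inspecting it, reusing the file downloaded above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: the dict returned by chardet.detect also carries a\n",
"# \"confidence\" key reporting how sure the detector is of its guess.\n",
"with open(\"mystery.txt\", \"rb\") as source:\n",
"    result = chardet.detect(source.read())\n",
"print(f\"{result['encoding']} (confidence: {result['confidence']:.2f})\")"
]
},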
{
"cell_type": "markdown",
"metadata": {
"id": "GTTGNEp4emK2"
},
"source": [
"## Part 2: word frequencies"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Km8jUOJpfJN4",
"outputId": "1c3c2e8f-deef-4077-83e5-d7bec1050a2d"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 80949 100 80949 0 0 58856 0 0:00:01 0:00:01 --:--:-- 58872\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/kbg/nltk_data...\n",
"[nltk_data] Unzipping tokenizers/punkt.zip.\n"
]
}
],
"source": [
"# Download the corpus; then fetch the NLTK tokenizer models.\n",
"! curl -O https://www.wellformedness.com/courses/LING78100/data/scorpio.txt\n",
"\n",
"import nltk\n",
"\n",
"\n",
"# nltk.download returns True on success, so this assertion fails loudly\n",
"# if the punkt tokenizer models cannot be fetched.\n",
"assert nltk.download(\"punkt\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "iME5v3f0eoj9",
"outputId": "8d78a876-95c6-4a6a-a9b2-405f66de2e8f"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
".: 857\n",
"you: 848\n",
",: 603\n",
"to: 542\n",
"the: 481\n",
"your: 409\n",
"and: 389\n",
"a: 355\n",
"it: 289\n",
"of: 238\n"
]
}
],
"source": [
"import collections\n",
"\n",
"\n",
"frequencies = collections.Counter()\n",
"with open(\"scorpio.txt\", \"r\") as source:\n",
"    for line in source:\n",
"        # Note that the tokenizer strips trailing whitespace for you.\n",
"        tokens = nltk.word_tokenize(line)\n",
"        folded_tokens = [token.casefold() for token in tokens]\n",
"        frequencies.update(folded_tokens)\n",
"for word, freq in frequencies.most_common(10):\n",
"    print(f\"{word}: {freq:,}\")"
]
},
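{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that `nltk.word_tokenize` treats punctuation marks as tokens, which is why `.` and `,` rank so high above. If only alphabetic words were wanted, one hypothetical variant (illustrative, not part of the assignment) filters tokens with `str.isalpha`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical variant: count only alphabetic tokens, skipping punctuation.\n",
"word_frequencies = collections.Counter()\n",
"with open(\"scorpio.txt\", \"r\") as source:\n",
"    for line in source:\n",
"        for token in nltk.word_tokenize(line):\n",
"            folded = token.casefold()\n",
"            if folded.isalpha():\n",
"                word_frequencies[folded] += 1\n",
"for word, freq in word_frequencies.most_common(10):\n",
"    print(f\"{word}: {freq:,}\")"
]
},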
{
"cell_type": "markdown",
"metadata": {
"id": "LOmAQQcWjWwl"
},
"source": [
"## Part 3: unique words"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YkGGIMaglQ87",
"outputId": "aa5893c5-f590-469a-bb84-0f6e988053c7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There are 2,529 unique words in the corpus.\n"
]
}
],
"source": [
"words = set()\n",
"with open(\"scorpio.txt\", \"r\") as source:\n",
"    for line in source:\n",
"        # Note that the tokenizer strips trailing whitespace for you.\n",
"        tokens = nltk.word_tokenize(line)\n",
"        folded_tokens = [token.casefold() for token in tokens]\n",
"        words.update(folded_tokens)\n",
"print(f\"There are {len(words):,} unique words in the corpus.\")"
]
},
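{
"cell_type": "markdown",
"metadata": {},
"source": [
"Equivalently, the unique words are exactly the keys of the `frequencies` counter built in Part 2, since both cells tokenize and case-fold the same file; a quick sanity check of that equivalence (illustrative, not required):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity check: the unique words are exactly the keys of the Part 2 counter.\n",
"assert words == set(frequencies)\n",
"print(f\"There are {len(frequencies):,} unique words in the corpus.\")"
]
},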
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Part 4: reflection"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Your reflection here."
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 1
}