-
-
Save kylebgorman/341e6c32801bb8bd7a2fda025c4da21b to your computer and use it in GitHub Desktop.
Methods I HW5 solution
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "_Vz0Of0pRv0N" | |
}, | |
"source": [ | |
"# HW5 solution" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"id": "XUApqES3RyMl" | |
}, | |
"outputs": [], | |
"source": [ | |
"from typing import List" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "i0Ng0rRzQaPp" | |
}, | |
"source": [ | |
"## Part 1: a sniffing function" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"id": "qVXs4F8dQp85" | |
}, | |
"outputs": [], | |
"source": [ | |
import chardet


def sniff(filepath: str) -> str:
    """Sniffs the best encoding using chardet.

    Args:
        filepath: path to a file to be sniffed.

    Returns:
        The best guess encoding string.

    NOTE(review): chardet reports None for the encoding when it cannot make
    a guess (e.g., an empty file), so callers should be prepared for a
    non-str return in that case — confirm against chardet.detect's docs.
    """
    # chardet operates on raw bytes, so the file is opened in binary mode
    # and read in full before detection.
    with open(filepath, "rb") as source:
        result = chardet.detect(source.read())
    return result["encoding"]
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "QXurUKXoROIE", | |
"outputId": "12cd3d2c-6c1f-409d-8c30-9c90fa0f79e1" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" % Total % Received % Xferd Average Speed Time Time Time Current\n", | |
" Dload Upload Total Spent Left Speed\n", | |
"100 1289 100 1289 0 0 602 0 0:00:02 0:00:02 --:--:-- 602\n" | |
] | |
} | |
], | |
"source": [ | |
# Download the mystery file and confirm the sniffer identifies its encoding
# as Shift JIS (a legacy Japanese encoding), per the assignment's expectation.
! curl -O https://www.wellformedness.com/courses/LING78100/data/mystery.txt

assert sniff("mystery.txt") == "SHIFT_JIS"
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "GTTGNEp4emK2" | |
}, | |
"source": [ | |
"## Part 2: word frequencies" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Km8jUOJpfJN4", | |
"outputId": "1c3c2e8f-deef-4077-83e5-d7bec1050a2d" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" % Total % Received % Xferd Average Speed Time Time Time Current\n", | |
" Dload Upload Total Spent Left Speed\n", | |
"100 80949 100 80949 0 0 58856 0 0:00:01 0:00:01 --:--:-- 58872\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"[nltk_data] Downloading package punkt to /home/kbg/nltk_data...\n", | |
"[nltk_data] Unzipping tokenizers/punkt.zip.\n" | |
] | |
} | |
], | |
"source": [ | |
# Setup stuff: fetch the corpus and the NLTK tokenizer model used below.
! curl -O https://www.wellformedness.com/courses/LING78100/data/scorpio.txt

import nltk


# nltk.download returns True on success, so the assert fails fast if the
# "punkt" tokenizer data could not be fetched.
assert nltk.download("punkt")
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "iME5v3f0eoj9", | |
"outputId": "8d78a876-95c6-4a6a-a9b2-405f66de2e8f" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
".: 857\n", | |
"you: 848\n", | |
",: 603\n", | |
"to: 542\n", | |
"the: 481\n", | |
"your: 409\n", | |
"and: 389\n", | |
"a: 355\n", | |
"it: 289\n", | |
"of: 238\n" | |
] | |
} | |
], | |
"source": [ | |
"import collections\n", | |
"\n", | |
"\n", | |
"frequencies = collections.Counter()\n", | |
"with open(\"scorpio.txt\", \"r\") as source:\n", | |
" for line in source:\n", | |
" # Note that the tokenizer strips trailing whitespace for you.\n", | |
" tokens = nltk.word_tokenize(line)\n", | |
" folded_tokens = [token.casefold() for token in tokens]\n", | |
" frequencies.update(folded_tokens)\n", | |
"for word, freq in frequencies.most_common(10):\n", | |
" print(f\"{word}: {freq:,}\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "LOmAQQcWjWwl" | |
}, | |
"source": [ | |
"## Part 3: unique words" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "YkGGIMaglQ87", | |
"outputId": "aa5893c5-f590-469a-bb84-0f6e988053c7" | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"There are 2,529 unique words in the corpus.\n" | |
] | |
} | |
], | |
"source": [ | |
# Collect the set of distinct case-folded tokens across the whole corpus.
words = set()
with open("scorpio.txt", "r") as source:
    for line in source:
        # The tokenizer strips trailing whitespace for us; the set absorbs
        # duplicates, so only distinct case-folded tokens remain.
        words.update(token.casefold() for token in nltk.word_tokenize(line))
print(f"There are {len(words):,} unique words in the corpus.")
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Part 4: reflection" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Your reflection here." | |
] | |
} | |
], | |
"metadata": { | |
"colab": { | |
"collapsed_sections": [], | |
"provenance": [] | |
}, | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment