Skip to content

Instantly share code, notes, and snippets.

@shello
Created June 10, 2020 19:50
Show Gist options
  • Save shello/1439b505f55249538454d0c1ba3cfab9 to your computer and use it in GitHub Desktop.
Save shello/1439b505f55249538454d0c1ba3cfab9 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"PT_LETTERS_BASE = \"abcdefghijklmnopqrstuvwxyz\"\n",
"PT_LETTERS_DIACRITICS = \"áàãâçéêíóõôú\"\n",
"PT_LETTERS_DIACRITICS_CONV = \"aaaaceeiooou\"\n",
"PT_LETTERS_CONV = str.maketrans(PT_LETTERS_DIACRITICS, PT_LETTERS_DIACRITICS_CONV)\n",
"PT_LETTERS_ALL = PT_LETTERS_BASE + PT_LETTERS_DIACRITICS\n",
"PT_DIACRITICS = {\n",
" 'acute': set('áéíóú'),\n",
" 'grave': set('à'),\n",
" 'tilde': set('ãõ'),\n",
" 'circumflex': set('âêô'),\n",
" 'cedil': set('ç'),\n",
"}\n",
"\n",
"MARKER_NOT_LETTER = ' '\n",
"MARKER_UNIQUE = '^'\n",
"MARKER_DIACRITIC_UNIQUE = MARKER_UNIQUE\n",
"MARKER_OVER_10 = '+'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def analyze_pangram(pangram):\n",
" # Normalize the string: lower-case it, then keep only characters found in PT_LETTERS_ALL\n",
" normalized_lower = pangram.lower()\n",
" normalized = ''.join(filter(lambda l: l in PT_LETTERS_ALL, normalized_lower))\n",
" \n",
" # Counters\n",
" normalized_base = normalized.translate(PT_LETTERS_CONV)\n",
" \n",
" letter_counter = Counter(normalized)\n",
" letter_base_counter = Counter(normalized_base)\n",
" \n",
" # Inline preview with markers\n",
" pangram_markers = []\n",
" for letter in normalized_lower:\n",
" # Not counted, not a letter\n",
" if letter not in letter_counter:\n",
" pangram_markers.append(MARKER_NOT_LETTER)\n",
" # Diacritic letter: always given the unique marker, whatever its count\n",
" elif letter not in letter_base_counter and letter in letter_counter:\n",
" pangram_markers.append(MARKER_DIACRITIC_UNIQUE)\n",
" # Non-diacritic letter occurring exactly once (diacritic variants are not counted here)\n",
" elif letter_counter[letter] == 1:\n",
" pangram_markers.append(MARKER_UNIQUE)\n",
" # Repeated letter: show its base-letter count (diacritic variants included), or the over-10 marker\n",
" else:\n",
" count = letter_base_counter[letter]\n",
" pangram_markers.append(str(count) if count <= 9 else MARKER_OVER_10)\n",
" \n",
" print(pangram)\n",
" print(''.join(pangram_markers))\n",
" print()\n",
" \n",
" total_length = len(normalized)\n",
" print(\"{} letters in length ({} characters).\".format(total_length, len(pangram)))\n",
" \n",
" # Base letters\n",
" letters_present = set(''.join(letter_base_counter))\n",
" letters_missing = sorted(set(PT_LETTERS_BASE) - letters_present)\n",
" \n",
" if letters_missing:\n",
" print(\"Missing letters: {}\".format(', '.join(letters_missing)))\n",
" else:\n",
" print(\"No letters missing.\")\n",
" \n",
" # Diacritic categories: find which ones are present and which are missing\n",
" diacritics_present = set(letter_counter) & set(PT_LETTERS_DIACRITICS)\n",
" diacritics_missing = list(PT_DIACRITICS.keys())\n",
" \n",
" for diacritic, letters in PT_DIACRITICS.items():\n",
" if letters & diacritics_present:\n",
" diacritics_missing.remove(diacritic)\n",
" \n",
" if diacritics_missing:\n",
" print(\"Missing diacritics: {}\".format(', '.join(diacritics_missing)))\n",
" else:\n",
" print(\"No diacritics missing.\")\n",
" print()\n",
" \n",
" # Duplicate letters\n",
" print(\"Frequency of duplicates:\")\n",
" for letter, count in letter_base_counter.most_common():\n",
" if count <= 1:\n",
" break\n",
" print(\" {} | {: >2d} ({:>4.1f}%) | {}\".format(\n",
" letter, count, round(count / total_length * 100, 1), '#' * count))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Açor frágil, viaja amanhã para Sul, traz o kiwi de nylon àquela bruxa robô.\n",
"+^56 ^6^^44 ^4+^+ +^+3^^ ^+6+ ^34 ^6+^ 5 ^4^4 ^2 3^453 ^^324+ 263^+ 652^ \n",
"\n",
"59 letters in length (75 characters).\n",
"No letters missing.\n",
"No diacritics missing.\n",
"\n",
"Frequency of duplicates:\n",
" a | 13 (22.0%) | #############\n",
" r | 6 (10.2%) | ######\n",
" o | 5 ( 8.5%) | #####\n",
" i | 4 ( 6.8%) | ####\n",
" l | 4 ( 6.8%) | ####\n",
" n | 3 ( 5.1%) | ###\n",
" u | 3 ( 5.1%) | ###\n",
" e | 2 ( 3.4%) | ##\n",
" b | 2 ( 3.4%) | ##\n"
]
}
],
"source": [
"analyze_pangram(\"Açor frágil, viaja amanhã para Sul, traz o kiwi de nylon àquela bruxa robô.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment