Skip to content

Instantly share code, notes, and snippets.

@cademcniven
Created February 28, 2021 17:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cademcniven/6b10c40652a36a8be2bc14d19f69a5c1 to your computer and use it in GitHub Desktop.
Save cademcniven/6b10c40652a36a8be2bc14d19f69a5c1 to your computer and use it in GitHub Desktop.
Copy of kanji deck.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Copy of kanji deck.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyNFzKbad9KS072j5FvGijrL",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/cademcniven/6b10c40652a36a8be2bc14d19f69a5c1/copy-of-kanji-deck.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "eQaBFCYe7Rim",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "1034a637-41e4-442b-a192-c1db9769adcf"
},
"source": [
"from google.colab import drive\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"# Mount Google Drive so the CSV files stored there can be read and written.\n",
"drive.mount('/content/drive', force_remount=True)\n",
"\n",
"kanji = pd.read_csv(\"/content/drive/My Drive/kanji.csv\")\n",
"# Read every deck column as text and keep empty cells as \"\" so the note\n",
"# fields round-trip unchanged: this skips pandas' type inference on the\n",
"# file and avoids writing literal \"nan\" strings into the export.\n",
"deck = pd.read_csv(\n",
"    \"/content/drive/My Drive/morphy.csv\", dtype=str, keep_default_na=False\n",
")\n",
"\n",
"kanji = kanji.rename(columns={\"keyword\": \"readingEN\"})\n",
"\n",
"# KLC order dictionary: kanji character -> id from kanji.csv.\n",
"kanjiFreq = dict(zip(kanji[\"kanji\"], kanji[\"id\"]))\n",
"\n",
"\n",
"def highest_kanji(unknowns, kanji_order):\n",
"    \"\"\"Find the highest-numbered kanji among a card's unknown words.\n",
"\n",
"    Args:\n",
"        unknowns: Text of the MorphMan_Unknowns field; multiple words are\n",
"            separated by \", \".\n",
"        kanji_order: Dict mapping a kanji character to its numeric id.\n",
"\n",
"    Returns:\n",
"        A (word, kanji_id) tuple holding the largest id found and the word\n",
"        containing that kanji, or (None, 0) when no character of any word\n",
"        appears in kanji_order.\n",
"    \"\"\"\n",
"    best_word, best_id = None, 0\n",
"    for word in unknowns.split(\", \"):\n",
"        for character in word:\n",
"            kanji_id = kanji_order.get(character)\n",
"            # Strict \">\" keeps the first word seen when ids tie.\n",
"            if kanji_id is not None and kanji_id > best_id:\n",
"                best_word, best_id = word, kanji_id\n",
"    return best_word, best_id\n",
"\n",
"\n",
"# Give each card a \"tempId\" (its highest kanji id) and the \"word\" that\n",
"# contains that kanji.\n",
"# NOTE(review): \"word\" is assumed to be the unknown holding the\n",
"# highest-numbered kanji; confirm this is the intended grouping key.\n",
"ranked = [\n",
"    highest_kanji(unknowns, kanjiFreq)\n",
"    for unknowns in deck[\"MorphMan_Unknowns\"]\n",
"]\n",
"deck[\"word\"] = [word for word, _ in ranked]\n",
"deck[\"tempId\"] = [kanji_id for _, kanji_id in ranked]\n",
"\n",
"# Keep only the first card for each word. Cards without any known kanji\n",
"# have no word, so they compare equal here and only the first one is kept.\n",
"deck = deck.drop_duplicates(subset=[\"word\"], keep=\"first\")\n",
"\n",
"# Export the result to a new CSV file.\n",
"deck.to_csv('/content/drive/My Drive/newmorphy.csv', index=False)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Mounted at /content/drive\n"
],
"name": "stdout"
},
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py:2718: DtypeWarning: Columns (3,8) have mixed types.Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler, result=result)\n"
],
"name": "stderr"
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment