Copy of kanji deck.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Copy of kanji deck.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"authorship_tag": "ABX9TyNFzKbad9KS072j5FvGijrL", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/cademcniven/6b10c40652a36a8be2bc14d19f69a5c1/copy-of-kanji-deck.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "eQaBFCYe7Rim", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "1034a637-41e4-442b-a192-c1db9769adcf" | |
}, | |
"source": [ | |
"from google.colab import drive\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"\n", | |
"#mount google drive so I can grab files from there\n", | |
"drive.mount('/content/drive', force_remount=True)\n", | |
"\n", | |
"kanji = pd.read_csv(\"/content/drive/My Drive/kanji.csv\")\n", | |
"deck = pd.read_csv(\"/content/drive/My Drive/morphy.csv\")\n", | |
"\n", | |
"kanji.rename(columns={\"keyword\": \"readingEN\"}, inplace=True)\n", | |
"deck = deck.astype(str)\n", | |
"\n", | |
"#make a KLC order dictionary\n", | |
"kanjiFreq = dict(zip(kanji.kanji, kanji.id))\n", | |
"\n", | |
"deck\n", | |
"\n", | |
"# assign a \"tempID\" to each word, which is the number of the highest KLC ID kanji in the word\n", | |
"for index, row in deck.iterrows():\n", | |
" #split the word into the individual kanji\n", | |
" unknowns = str(row['MorphMan_Unknowns'])\n", | |
"\n", | |
" wordList = unknowns.split(\", \")\n", | |
" largestKanjiFreq = 0\n", | |
" for word in wordList:\n", | |
" #loop through every kanji in the word to find which one has the highest number\n", | |
" #in the frequency index\n", | |
" for character in word:\n", | |
" temp = kanjiFreq.get(str(character))\n", | |
" if (temp is not None and temp > largestKanjiFreq):\n", | |
" largestKanjiFreq = temp\n", | |
" deck.at[index, \"word\"] = word\n", | |
" \n", | |
" deck.at[index, \"tempId\"] = largestKanjiFreq\n", | |
"\n", | |
"deck.drop_duplicates(subset=[\"word\"], keep='first', inplace=True)\n", | |
"\n", | |
"deck\n", | |
"\n", | |
"#export df to a new csv file\n", | |
"deck.to_csv('/content/drive/My Drive/newmorphy.csv', index=False)" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Mounted at /content/drive\n" | |
], | |
"name": "stdout" | |
}, | |
{ | |
"output_type": "stream", | |
"text": [ | |
"/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py:2718: DtypeWarning: Columns (3,8) have mixed types.Specify dtype option on import or set low_memory=False.\n", | |
" interactivity=interactivity, compiler=compiler, result=result)\n" | |
], | |
"name": "stderr" | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment