Skip to content

Instantly share code, notes, and snippets.

@simon2016bht
Last active June 17, 2020 03:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save simon2016bht/e51f5940ff931434c4fdd7bda719d3df to your computer and use it in GitHub Desktop.
Save simon2016bht/e51f5940ff931434c4fdd7bda719d3df to your computer and use it in GitHub Desktop.
tag_HSK_level.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.1"
},
"colab": {
"name": "tag_HSK_level.ipynb",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/simon2016bht/e51f5940ff931434c4fdd7bda719d3df/tag_hsk_level-1.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "tx9nlTGBFdOh",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 266
},
"outputId": "c6ec94fd-f658-4f2d-fafb-a56f87fb7c93"
},
"source": [
"print(\"Input some Chinese text:\")\n",
"text=input()\n",
"# text='一家人在吃饭。 儿子问:“爸爸,虫子能吃吗?” 爸爸说:“儿子, 妈妈做的饭菜好吃吗?” 儿子说:“很好吃!” 爸爸说:“那么,你好好吃饭。 吃饭的时候,不要说话。好吗?” 儿子说:“好的。”'\n",
"# text='在一个商店,他看到一只小猫,对不起,大家到了清华大学'\n",
"# print(text)\n",
"# ==============================\n",
"\n",
"!pip install wget termcolor jieba\n",
"import json\n",
"import termcolor\n",
"import jieba\n",
"import wget\n",
"from pathlib import Path\n",
"import shutil\n",
"import os.path\n",
"\n",
"# Make sure the three HSK word-list files exist locally; download any that are missing.\n",
"HSK_ASSETS_URL = 'https://raw.githubusercontent.com/simon2016bht/TagHskWords/master/assets/'\n",
"Path(\"./assets\").mkdir(parents=True, exist_ok=True)\n",
"for hsk_level in (1, 2, 3):\n",
"    hsk_filename = 'hsk-level-{}.json'.format(hsk_level)\n",
"    if not os.path.exists('assets/' + hsk_filename):\n",
"        # the download lands in the current directory, so it is moved into assets/ afterwards\n",
"        wget.download(HSK_ASSETS_URL + hsk_filename)\n",
"        shutil.move('./' + hsk_filename, 'assets/' + hsk_filename)\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"# load file contents into lists\n",
"def load_hsk_data(level):\n",
"    \"\"\"Return the parsed JSON content of assets/hsk-level-<level>.json.\"\"\"\n",
"    # The word lists contain Chinese characters, so read them as UTF-8 explicitly\n",
"    # instead of relying on the platform's default encoding.\n",
"    with open('assets/hsk-level-{}.json'.format(level), encoding='utf-8') as file:\n",
"        return json.load(file)\n",
"\n",
"# each entry of the data is a dict; only its 'hanzi' field is needed for matching\n",
"hsk1_data = load_hsk_data(1)\n",
"hsk1_words = [item['hanzi'] for item in hsk1_data]\n",
"\n",
"hsk2_data = load_hsk_data(2)\n",
"hsk2_words = [item['hanzi'] for item in hsk2_data]\n",
"\n",
"hsk3_data = load_hsk_data(3)\n",
"hsk3_words = [item['hanzi'] for item in hsk3_data]\n",
"# =================================\n",
"\n",
"# tag words that belong to HSK 1, 2 or 3\n",
"tagged_words_hsk1=[]\n",
"tagged_words_hsk2=[]\n",
"tagged_words_hsk3=[]\n",
"\n",
"# longest substring of a segment that is looked up in the HSK lists\n",
"MAX_HSK_WORD_LENGTH = 4\n",
"\n",
"def candidate_words(word):\n",
"    \"\"\"Return `word` followed by its proper substrings of at most MAX_HSK_WORD_LENGTH characters.\n",
"\n",
"    The substrings are ordered by length, then by position, and are taken from\n",
"    the whole segment, so segments longer than four characters are searched completely.\n",
"    \"\"\"\n",
"    candidates = [word]\n",
"    longest = min(len(word) - 1, MAX_HSK_WORD_LENGTH)\n",
"    for size in range(1, longest + 1):\n",
"        for start in range(len(word) - size + 1):\n",
"            candidates.append(word[start:start + size])\n",
"    return candidates\n",
"\n",
"# using jieba for word segmentation\n",
"for word in jieba.cut(text, cut_all=False):\n",
"    # check the segment itself first, then every shorter piece contained in it\n",
"    for candidate in candidate_words(word):\n",
"        # each level is checked independently, so the result does not depend on\n",
"        # how many times the same word occurs in the text\n",
"        if candidate in hsk1_words and candidate not in tagged_words_hsk1:\n",
"            tagged_words_hsk1.append(candidate)\n",
"        if candidate in hsk2_words and candidate not in tagged_words_hsk2:\n",
"            tagged_words_hsk2.append(candidate)\n",
"        if candidate in hsk3_words and candidate not in tagged_words_hsk3:\n",
"            tagged_words_hsk3.append(candidate)\n",
"# print(\"=======================\")\n",
"# print(\"HSK1:\",tagged_words_hsk1)\n",
"# print(\"HSK2:\",tagged_words_hsk2)\n",
"# print(\"HSK3:\",tagged_words_hsk3)\n",
"\n",
"# ====================================\n",
"\n",
"\n",
"# Create list of flags for each HSK level\n",
"\n",
"# initialize flag as list of 0 (one entry per character of the text)\n",
"hsk1_flag=[0]*len(text)\n",
"hsk2_flag=[0]*len(text)\n",
"hsk3_flag=[0]*len(text)\n",
"\n",
"\n",
"## flag a slice of list according to the length of the HSK word\n",
"def tag(flag_list_name,starting_position, length, hsk_level):\n",
"    \"\"\"Write `hsk_level` into `length` consecutive entries of `flag_list_name`, starting at `starting_position`.\"\"\"\n",
"    for offset in range(length):\n",
"        flag_list_name[starting_position+offset]=hsk_level\n",
"\n",
"# (HSK level, tagged words of that level, flag list of that level), lowest level first\n",
"hsk_levels = [\n",
"    (1, tagged_words_hsk1, hsk1_flag),\n",
"    (2, tagged_words_hsk2, hsk2_flag),\n",
"    (3, tagged_words_hsk3, hsk3_flag),\n",
"]\n",
"\n",
"# going through the text\n",
"for cursor_position in range(len(text)):\n",
"    # test windows of 4 down to 1 characters that start at the cursor\n",
"    for window_size in (4, 3, 2, 1):\n",
"        window = text[cursor_position:cursor_position+window_size]\n",
"        # a shorter slice means the window ran past the end of the text\n",
"        if len(window) != window_size:\n",
"            continue\n",
"        # only the lowest level that contains this window is flagged for it\n",
"        for level, tagged_words, flag_list in hsk_levels:\n",
"            if window in tagged_words:\n",
"                tag(flag_list, cursor_position, window_size, level)\n",
"                break\n",
"\n",
"\n",
"# # check tagging result for each HSK level\n",
"# for i in enumerate(text):\n",
"# print(i[0],text[i[0]],hsk1_flag[i[0]], hsk2_flag[i[0]], hsk3_flag[i[0]])\n",
"\n",
"\n",
"# ======================================\n",
"\n",
"## combine flags and assign font color and background color to each character\n",
"# Available text colors: red, green, yellow, blue, magenta, cyan, white.\n",
"HSK1_color = 'red'\n",
"HSK2_color = 'green'\n",
"HSK3_color = 'yellow'\n",
"\n",
"# one style record per character of the text, initially uncolored\n",
"combined_flag = [\n",
"    {'character': character, 'font_color': None, 'bg_color': None}\n",
"    for character in text\n",
"]\n",
"\n",
"for cursor_position, style in enumerate(combined_flag):\n",
"    if hsk1_flag[cursor_position] != 0:\n",
"        style['font_color'] = HSK1_color\n",
"    # a higher HSK level takes the font color if it is still free, otherwise the background color\n",
"    for level_flag, level_color in ((hsk2_flag, HSK2_color), (hsk3_flag, HSK3_color)):\n",
"        if level_flag[cursor_position] == 0:\n",
"            continue\n",
"        if style['font_color'] is None:\n",
"            style['font_color'] = level_color\n",
"        elif style['bg_color'] is None:\n",
"            style['bg_color'] = 'on_' + level_color\n",
" \n",
"# =======================================\n",
"\n",
"\n",
"# print the text with the colors recorded in combined_flag, then the tagged words per level\n",
"print(\"Colored text (red for HSK1, green for HSK2, yellow for HSK3):\\n---\")\n",
"colored_characters = [\n",
"    termcolor.colored(character, color=style['font_color'], on_color=style['bg_color'])\n",
"    for character, style in zip(text, combined_flag)\n",
"]\n",
"print(''.join(colored_characters), end=\"\")\n",
"print(\"\\n---\")\n",
"print('HSK3 words:', tagged_words_hsk3)\n",
"print('HSK2 words:', tagged_words_hsk2)\n",
"print('HSK1 words:', tagged_words_hsk1)"
],
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"text": [
"Input some Chinese text:\n",
"我说爸爸的手表可不可以跑慢一点 给他更多的时间可以多看一点, 早上可以晚一点,起床多睡一点, (少累一点)我只希望这一点一点 我说爸爸的手表可不可以跑慢一点 给他更多的时间到处多转一些, 可以少担一点,我的心放松一点 我只希望这一点一点\n",
"Requirement already satisfied: wget in /usr/local/lib/python3.6/dist-packages (3.2)\n",
"Requirement already satisfied: termcolor in /usr/local/lib/python3.6/dist-packages (1.1.0)\n",
"Requirement already satisfied: jieba in /usr/local/lib/python3.6/dist-packages (0.42.1)\n",
"Colored text (red for HSK1, green for HSK2, yellow for HSK3):\n",
"---\n",
"\u001b[31m我\u001b[0m\u001b[31m说\u001b[0m\u001b[31m爸\u001b[0m\u001b[31m爸\u001b[0m\u001b[31m的\u001b[0m\u001b[32m手\u001b[0m\u001b[32m表\u001b[0m可\u001b[0m\u001b[31m不\u001b[0m\u001b[32m可\u001b[0m\u001b[32m以\u001b[0m\u001b[32m跑\u001b[0m\u001b[32m慢\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m \u001b[0m\u001b[32m给\u001b[0m\u001b[31m他\u001b[0m\u001b[33m更\u001b[0m\u001b[31m多\u001b[0m\u001b[31m的\u001b[0m\u001b[42m\u001b[31m时\u001b[0m\u001b[32m间\u001b[0m\u001b[32m可\u001b[0m\u001b[32m以\u001b[0m\u001b[31m多\u001b[0m\u001b[31m看\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m,\u001b[0m \u001b[0m\u001b[32m早\u001b[0m\u001b[42m\u001b[31m上\u001b[0m\u001b[32m可\u001b[0m\u001b[32m以\u001b[0m\u001b[32m晚\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m,\u001b[0m\u001b[32m起\u001b[0m\u001b[32m床\u001b[0m\u001b[31m多\u001b[0m\u001b[31m睡\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m,\u001b[0m \u001b[0m(\u001b[0m\u001b[31m少\u001b[0m\u001b[32m累\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m)\u001b[0m\u001b[31m我\u001b[0m\u001b[33m只\u001b[0m\u001b[32m希\u001b[0m\u001b[32m望\u001b[0m\u001b[31m这\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m \u001b[0m\u001b[31m我\u001b[0m\u001b[31m说\u001b[0m\u001b[31m爸\u001b[0m\u001b[31m爸\u001b[0m\u001b[31m的\u001b[0m\u001b[32m手\u001b[0m\u001b[32m表\u001b[0m可\u001b[0m\u001b[31m不\u001b[0m\u001b[32m可\u001b[0m\u001b[32m以\u001b[0m\u001b[32m跑\u001b[0m\u001b[32m慢\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m \u001b[0m\u001b[32m给\u001b[0m\u001b[31m他\u001b[0m\u001b[33m更\u001b[0m\u001b[31m多\u001b[0m\u001b[31m的\u001b[0m\u001b[42m\u001b[31m时\u001b[0m\u001b[32m间\u001b[0m\u001b[32m到\u001b[0m处\u001b[0m\u001b[31m多\u001b[0m转\u001b[0m\u001b[31m一\u001b[0m\u001b[31m些\u001b[0m,\u001b[0m \u001b[0m\u001b[32m可\u001b[0m\u001b[32m以\u001b[0m\u001b[31m少\u001b[0m担\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m,\u001b[0m\u001b[31m我\u001b[0m\u001b[31m的\u001b[0m心\u001b[0m\u001b[33m放\u001b[0m松\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m 
\u001b[0m\u001b[31m我\u001b[0m\u001b[33m只\u001b[0m\u001b[32m希\u001b[0m\u001b[32m望\u001b[0m\u001b[31m这\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m\u001b[31m一\u001b[0m\u001b[31m点\u001b[0m\n",
"---\n",
"HSK3 words: ['更', '只', '放']\n",
"HSK2 words: ['手表', '手', '可以', '跑', '慢', '给', '时间', '间', '早上', '早', '晚', '起床', '起', '累', '希望', '到']\n",
"HSK1 words: ['我', '说', '爸爸', '的', '不', '一', '点', '他', '多', '时', '看', '上', '睡', '少', '这', '些']\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Bzw1nausIT0t",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "uB95a01iIUWB",
"colab_type": "code",
"colab": {}
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment