Skip to content

Instantly share code, notes, and snippets.

@aseaday
Created January 28, 2019 05:49
Show Gist options
  • Save aseaday/936fe6558b857fbfe2af3226828edbf2 to your computer and use it in GitHub Desktop.
Save aseaday/936fe6558b857fbfe2af3226828edbf2 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"import itertools\n",
"import json\n",
"from itertools import islice\n",
"\n",
"\n",
"def find_all(a_str, sub, *, overlapping=False):\n",
"    \"\"\"Yield the start index of every occurrence of ``sub`` in ``a_str``.\n",
"\n",
"    Args:\n",
"        a_str: String to search.\n",
"        sub: Substring to look for. An empty ``sub`` yields nothing:\n",
"            ``str.find('')`` matches without advancing, which would\n",
"            otherwise make this generator loop forever.\n",
"        overlapping: When true, advance a single character after each hit\n",
"            so overlapping occurrences are reported as well. The default\n",
"            skips past the whole match.\n",
"\n",
"    Yields:\n",
"        Indices into ``a_str`` in increasing order.\n",
"    \"\"\"\n",
"    if not sub:\n",
"        return\n",
"    step = 1 if overlapping else len(sub)\n",
"    start = a_str.find(sub)\n",
"    while start != -1:\n",
"        yield start\n",
"        start = a_str.find(sub, start + step)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"def deal_one_term(item):\n",
"    \"\"\"Convert one term record into character-level tagged sentences.\n",
"\n",
"    Every character of a kept sentence becomes a ``'<char> <tag>'`` string:\n",
"    ``B-WXA`` on the first character of each occurrence of the term,\n",
"    ``I-WXA`` on the remaining characters of that occurrence, ``O`` elsewhere.\n",
"\n",
"    Args:\n",
"        item: Mapping with a ``'term'`` string and a ``'bills'`` list whose\n",
"            entries hold their text under ``'sentense'``. The misspelled key\n",
"            is what this code reads; presumably it matches the data file,\n",
"            so it is left untouched.\n",
"\n",
"    Returns:\n",
"        A list with one list of tagged characters per sentence that ends in\n",
"        the full stop '。'; all other sentences are skipped.\n",
"    \"\"\"\n",
"    term = item['term']\n",
"    res = []\n",
"    for s in item['bills']:\n",
"        sentence = s['sentense']\n",
"        # endswith() copes with an empty sentence; indexing [-1] would raise IndexError.\n",
"        if not sentence.endswith('。'):\n",
"            continue\n",
"        ares = [c + ' O' for c in sentence]\n",
"        # An empty term matches everywhere without advancing the search, so tag nothing.\n",
"        for i in (find_all(sentence, term) if term else ()):\n",
"            ares[i] = sentence[i] + ' B-WXA'\n",
"            for j in range(i + 1, i + len(term)):\n",
"                ares[j] = sentence[j] + ' I-WXA'\n",
"        res.append(ares)\n",
"    return res"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"# Build the tagged corpus: one JSON record per line of term.txt.\n",
"r = []\n",
"# Explicit UTF-8: the records contain Chinese text and the platform default\n",
"# encoding is not UTF-8 everywhere.\n",
"with open(\"term.txt\", \"r\", encoding=\"utf-8\") as f:\n",
"    # Iterate the file lazily; readlines() would load every line before\n",
"    # islice could cap the input at the first 200000 lines.\n",
"    for line in islice(f, 200000):\n",
"        try:\n",
"            term_data_item = json.loads(line)\n",
"        except ValueError:\n",
"            # Best effort: skip lines that are not valid JSON.\n",
"            continue\n",
"        r.extend(deal_one_term(term_data_item))"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"24278"
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(r)"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"from random import shuffle"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['最 O',\n",
" '近 O',\n",
" '一 O',\n",
" '个 O',\n",
" '月 O',\n",
" '天 B-WXA',\n",
" '宇 I-WXA',\n",
" '大 I-WXA',\n",
" '陆 I-WXA',\n",
" '发 O',\n",
" '生 O',\n",
" '了 O',\n",
" '几 O',\n",
" '件 O',\n",
" '重 O',\n",
" '要 O',\n",
" '的 O',\n",
" '大 O',\n",
" '事 O',\n",
" '。 O']]"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"r[1:2]"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"shuffle(r)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"def write_file(r, filename):\n",
"    \"\"\"Write tagged sentences to ``filename``, one token per line.\n",
"\n",
"    Each sentence is followed by an empty line.\n",
"\n",
"    Args:\n",
"        r: Iterable of sentences, each a list of ``'<char> <tag>'`` strings.\n",
"        filename: Path of the output file; it is overwritten if it exists.\n",
"    \"\"\"\n",
"    # Build everything first so a malformed sentence fails before the file is touched.\n",
"    wlines = ['\\n'.join(s) + '\\n\\n' for s in r]\n",
"    # Explicit UTF-8 so the output does not depend on the platform default encoding.\n",
"    with open(filename, \"w\", encoding=\"utf-8\") as f:\n",
"        f.writelines(wlines)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"# Training split: the first 20000 shuffled samples. The slice starts at\n",
"# index 0 so that no sample is left out of both output files by accident.\n",
"write_file(r[:20000], \"train.txt\")"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"write_file(r[20000:24000], \"eval.txt\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment