aseaday/Untitled.ipynb

## Untitled.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "import itertools\n",
    "import json\n",
    "from itertools import islice\n",
    "\n",
    "\n",
    "def find_all(a_str, sub):\n",
    "    \"\"\"Yield the start index of each non-overlapping occurrence of sub in a_str.\n",
    "\n",
    "    The string is scanned left to right with str.find; after a match the\n",
    "    search resumes just past it, so overlapping occurrences are skipped.\n",
    "    Nothing is yielded when sub does not occur or when sub is empty.\n",
    "    \"\"\"\n",
    "    # str.find('') matches at the current position without advancing, so an\n",
    "    # empty needle would yield index 0 forever; treat it as no match.\n",
    "    if not sub:\n",
    "        return\n",
    "    start = a_str.find(sub)\n",
    "    while start != -1:\n",
    "        yield start\n",
    "        # Jump past the match (advance by 1 instead to find overlapping matches).\n",
    "        start = a_str.find(sub, start + len(sub))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "def deal_one_term(item):\n",
    "    \"\"\"Tag every usable sentence of one term record, character by character.\n",
    "\n",
    "    Reads item['term'] and, for each entry of item['bills'], its 'sentense'\n",
    "    field (the key is spelled this way in the data). Only sentences ending\n",
    "    with the full stop '。' are kept. Each kept sentence becomes a list of\n",
    "    '<char> <tag>' strings: 'B-WXA' on the first character of every\n",
    "    occurrence of the term, 'I-WXA' on its remaining characters and 'O'\n",
    "    on everything else.\n",
    "\n",
    "    Returns a list holding one such list per kept sentence.\n",
    "    \"\"\"\n",
    "    term = item['term']\n",
    "    res = []\n",
    "    for s in item['bills']:\n",
    "        sentence = s['sentense']\n",
    "        # endswith() is simply False for an empty sentence, whereas indexing\n",
    "        # [-1] raised IndexError, which the caller's ValueError handler misses.\n",
    "        if not sentence.endswith('。'):\n",
    "            continue\n",
    "        ares = [c + ' ' + 'O' for c in sentence]\n",
    "        # An empty term has nothing to tag and must never reach the substring\n",
    "        # scan, which cannot advance past a zero-length match.\n",
    "        if term:\n",
    "            for i in find_all(sentence, term):\n",
    "                ares[i] = sentence[i] + ' ' + 'B-WXA'\n",
    "                for j in range(1, len(term)):\n",
    "                    ares[i + j] = sentence[i + j] + ' ' + 'I-WXA'\n",
    "        res.append(ares)\n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the tagged sentences of the first 200000 records of term.txt,\n",
    "# which holds one JSON object per line.\n",
    "r = []\n",
    "# JSON text is UTF-8; do not rely on the platform's default encoding.\n",
    "with open(\"term.txt\", \"r\", encoding=\"utf-8\") as f:\n",
    "    # Iterate the file object directly so lines past the limit are never read.\n",
    "    for line in islice(f, 200000):\n",
    "        try:\n",
    "            term_data_item = json.loads(line)\n",
    "            r.extend(deal_one_term(term_data_item))\n",
    "        except ValueError:\n",
    "            # Lines that raise ValueError (e.g. malformed JSON) are skipped.\n",
    "            continue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "24278"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of tagged sentences collected in r.\n",
    "len(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "from random import shuffle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['最 O',\n",
       "  '近 O',\n",
       "  '一 O',\n",
       "  '个 O',\n",
       "  '月 O',\n",
       "  '天 B-WXA',\n",
       "  '宇 I-WXA',\n",
       "  '大 I-WXA',\n",
       "  '陆 I-WXA',\n",
       "  '发 O',\n",
       "  '生 O',\n",
       "  '了 O',\n",
       "  '几 O',\n",
       "  '件 O',\n",
       "  '重 O',\n",
       "  '要 O',\n",
       "  '的 O',\n",
       "  '大 O',\n",
       "  '事 O',\n",
       "  '。 O']]"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at a single tagged sentence (the second entry of r) as a sanity check.\n",
    "r[1:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "from random import seed\n",
    "\n",
    "# Seed the generator so the shuffled order, and therefore the train/eval\n",
    "# split written from it, is the same on every run.\n",
    "seed(42)\n",
    "shuffle(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "def write_file(r, filename):\n",
    "    \"\"\"Write tagged sentences to filename, one entry per line.\n",
    "\n",
    "    r is an iterable of sentences, each a list of strings. The strings of a\n",
    "    sentence are joined with newlines and every sentence is followed by a\n",
    "    blank line. Any existing file is overwritten.\n",
    "    \"\"\"\n",
    "    wlines = ['\\n'.join(s) + '\\n\\n' for s in r]\n",
    "    # Explicit UTF-8: the platform default encoding may be unable to encode\n",
    "    # the Chinese characters being written.\n",
    "    with open(filename, \"w\", encoding=\"utf-8\") as f:\n",
    "        f.writelines(wlines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training split: the first 20000 sentences. The slice starts at index 0;\n",
    "# starting at 1 silently left the first sentence out of the training file.\n",
    "write_file(r[:20000], \"train.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluation split: entries 20000-23999 of r. Anything past index 23999\n",
    "# is written to neither output file.\n",
    "write_file(r[20000:24000], \"eval.txt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 40,
	"metadata": {},
	"outputs": [],
	"source": [
	"import itertools\n",
	"import json\n",
	"from itertools import islice\n",
	"\n",
	"\n",
	"def find_all(a_str, sub):\n",
	"    \"\"\"Yield the start index of each non-overlapping occurrence of sub in a_str.\n",
	"\n",
	"    The string is scanned left to right with str.find; after a match the\n",
	"    search resumes just past it, so overlapping occurrences are skipped.\n",
	"    Nothing is yielded when sub does not occur or when sub is empty.\n",
	"    \"\"\"\n",
	"    # str.find('') matches at the current position without advancing, so an\n",
	"    # empty needle would yield index 0 forever; treat it as no match.\n",
	"    if not sub:\n",
	"        return\n",
	"    start = a_str.find(sub)\n",
	"    while start != -1:\n",
	"        yield start\n",
	"        # Jump past the match (advance by 1 instead to find overlapping matches).\n",
	"        start = a_str.find(sub, start + len(sub))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 62,
	"metadata": {},
	"outputs": [],
	"source": [
	"def deal_one_term(item):\n",
	"    \"\"\"Tag every usable sentence of one term record, character by character.\n",
	"\n",
	"    Reads item['term'] and, for each entry of item['bills'], its 'sentense'\n",
	"    field (the key is spelled this way in the data). Only sentences ending\n",
	"    with the full stop '。' are kept. Each kept sentence becomes a list of\n",
	"    '<char> <tag>' strings: 'B-WXA' on the first character of every\n",
	"    occurrence of the term, 'I-WXA' on its remaining characters and 'O'\n",
	"    on everything else.\n",
	"\n",
	"    Returns a list holding one such list per kept sentence.\n",
	"    \"\"\"\n",
	"    term = item['term']\n",
	"    res = []\n",
	"    for s in item['bills']:\n",
	"        sentence = s['sentense']\n",
	"        # endswith() is simply False for an empty sentence, whereas indexing\n",
	"        # [-1] raised IndexError, which the caller's ValueError handler misses.\n",
	"        if not sentence.endswith('。'):\n",
	"            continue\n",
	"        ares = [c + ' ' + 'O' for c in sentence]\n",
	"        # An empty term has nothing to tag and must never reach the substring\n",
	"        # scan, which cannot advance past a zero-length match.\n",
	"        if term:\n",
	"            for i in find_all(sentence, term):\n",
	"                ares[i] = sentence[i] + ' ' + 'B-WXA'\n",
	"                for j in range(1, len(term)):\n",
	"                    ares[i + j] = sentence[i + j] + ' ' + 'I-WXA'\n",
	"        res.append(ares)\n",
	"    return res"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 77,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Collect the tagged sentences of the first 200000 records of term.txt,\n",
	"# which holds one JSON object per line.\n",
	"r = []\n",
	"# JSON text is UTF-8; do not rely on the platform's default encoding.\n",
	"with open(\"term.txt\", \"r\", encoding=\"utf-8\") as f:\n",
	"    # Iterate the file object directly so lines past the limit are never read.\n",
	"    for line in islice(f, 200000):\n",
	"        try:\n",
	"            term_data_item = json.loads(line)\n",
	"            r.extend(deal_one_term(term_data_item))\n",
	"        except ValueError:\n",
	"            # Lines that raise ValueError (e.g. malformed JSON) are skipped.\n",
	"            continue"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 78,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"24278"
	]
	},
	"execution_count": 78,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Number of tagged sentences collected in r.\n",
	"len(r)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 79,
	"metadata": {},
	"outputs": [],
	"source": [
	"from random import shuffle"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 80,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[['最 O',\n",
	" '近 O',\n",
	" '一 O',\n",
	" '个 O',\n",
	" '月 O',\n",
	" '天 B-WXA',\n",
	" '宇 I-WXA',\n",
	" '大 I-WXA',\n",
	" '陆 I-WXA',\n",
	" '发 O',\n",
	" '生 O',\n",
	" '了 O',\n",
	" '几 O',\n",
	" '件 O',\n",
	" '重 O',\n",
	" '要 O',\n",
	" '的 O',\n",
	" '大 O',\n",
	" '事 O',\n",
	" '。 O']]"
	]
	},
	"execution_count": 80,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Peek at a single tagged sentence (the second entry of r) as a sanity check.\n",
	"r[1:2]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 81,
	"metadata": {},
	"outputs": [],
	"source": [
	"from random import seed\n",
	"\n",
	"# Seed the generator so the shuffled order, and therefore the train/eval\n",
	"# split written from it, is the same on every run.\n",
	"seed(42)\n",
	"shuffle(r)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 94,
	"metadata": {},
	"outputs": [],
	"source": [
	"def write_file(r, filename):\n",
	"    \"\"\"Write tagged sentences to filename, one entry per line.\n",
	"\n",
	"    r is an iterable of sentences, each a list of strings. The strings of a\n",
	"    sentence are joined with newlines and every sentence is followed by a\n",
	"    blank line. Any existing file is overwritten.\n",
	"    \"\"\"\n",
	"    wlines = ['\\n'.join(s) + '\\n\\n' for s in r]\n",
	"    # Explicit UTF-8: the platform default encoding may be unable to encode\n",
	"    # the Chinese characters being written.\n",
	"    with open(filename, \"w\", encoding=\"utf-8\") as f:\n",
	"        f.writelines(wlines)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 95,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Training split: the first 20000 sentences. The slice starts at index 0;\n",
	"# starting at 1 silently left the first sentence out of the training file.\n",
	"write_file(r[:20000], \"train.txt\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 96,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Evaluation split: entries 20000-23999 of r. Anything past index 23999\n",
	"# is written to neither output file.\n",
	"write_file(r[20000:24000], \"eval.txt\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}