{
"cells": [
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import IPython.display as Disp\n\nimport os,sys\n\nos.environ['DATA'] = '/home/ubuntu/Notebooks/ABG/text_related/fast_abs_rl/data/cnn-dailymail/finished_files/'",
"execution_count": 1,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "'https://github.com/ChenRocks/fast_abs_rl'\n### Options:\n\n - beam_size: number of hypothesis for (diverse) beam search. (use beam_size > 1 to enable reranking)\n - beam_szie=1 to get greedy decoding results (rnn-ext + abs + RL)\n - beam_size=5 is used in the paper for the +rerank model (rnn-ext + abs + RL + rerank)\n - test/val: decode on test/validation dataset"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "#### run decoding of the full model (RL)\n\n#### optional arguments:\n- -h, --help show this help message and exit\n- --path PATH path to store/eval\n- --model_dir MODEL_DIR root of the full model\n- --val use validation set\n- --test use test set\n- --batch BATCH batch size of faster decoding\n- --beam BEAM beam size for beam-search (reranking included)\n- --div DIV diverse ratio for the diverse beam-search\n- --max_dec_word MAX_DEC_WORD maximun words to be decoded for the abstractor\n- --no-cuda disable GPU training\n\n\npython decode_full_model.py --path=[path/to/save/decoded/files] --model_dir=[path/to/pretrained] --beam=[beam_size] [--test/--val]\n"
},
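{
"metadata": {},
"cell_type": "markdown",
"source": "The cells further down run the decoder with beam size 1 (greedy decoding). As a hedged sketch, the next cell shows how the +rerank variant from the paper (beam size 5) would be invoked; the --path value is a placeholder output directory, while --model_dir reuses the pretrained checkpoint directory used elsewhere in this notebook."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# hedged sketch, not executed: decode with beam size 5 to enable reranking\n# --path is a placeholder output directory; --model_dir reuses the pretrained\n# checkpoint directory used later in this notebook\n#!{sys.executable} decode_full_model.py --path=custom_rerank/ --model_dir=model/pretrained/acl --beam=5 --val",
"execution_count": null,
"outputs": []
},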
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import sys\nimport os\nimport hashlib\nimport subprocess\nimport collections\n\nimport json\nimport tarfile\nimport io\nimport pickle as pkl\n\n\ndm_single_close_quote = '\\u2019' # unicode\ndm_double_close_quote = '\\u201d'\n# acceptable ways to end a sentence\nEND_TOKENS = ['.', '!', '?', '...', \"'\", \"`\", '\"',\n dm_single_close_quote, dm_double_close_quote, \")\"]\n\nall_train_urls = \"url_lists/all_train.txt\"\nall_val_urls = \"url_lists/all_val.txt\"\nall_test_urls = \"url_lists/all_test.txt\"\n\ncnn_tokenized_stories_dir = \"cnn_stories_tokenized\"\ndm_tokenized_stories_dir = \"dm_stories_tokenized\"\nfinished_files_dir = \"finished_files\"\n\n# These are the number of .story files we expect there to be in cnn_stories_dir\n# and dm_stories_dir\nnum_expected_cnn_stories = 92579\nnum_expected_dm_stories = 219506\n\n\ndef tokenize_stories(stories_dir, tokenized_stories_dir):\n \"\"\"Maps a whole directory of .story files to a tokenized version using\n Stanford CoreNLP Tokenizer\n \"\"\"\n print(\"Preparing to tokenize {} to {}...\".format(stories_dir,\n tokenized_stories_dir))\n stories = os.listdir(stories_dir)\n # make IO list file\n print(\"Making list of files to tokenize...\")\n with open(\"mapping.txt\", \"w\") as f:\n for s in stories:\n f.write(\n \"{} \\t {}\\n\".format(\n os.path.join(stories_dir, s),\n os.path.join(tokenized_stories_dir, s)\n )\n )\n command = ['java', 'edu.stanford.nlp.process.PTBTokenizer',\n '-ioFileList', '-preserveLines', 'mapping.txt']\n print(\"Tokenizing {} files in {} and saving in {}...\".format(\n len(stories), stories_dir, tokenized_stories_dir))\n subprocess.call(command)\n print(\"Stanford CoreNLP Tokenizer has finished.\")\n os.remove(\"mapping.txt\")\n\n # Check that the tokenized stories directory contains the same number of\n # files as the original directory\n num_orig = len(os.listdir(stories_dir))\n num_tokenized = len(os.listdir(tokenized_stories_dir))\n if num_orig != num_tokenized:\n raise Exception(\n \"The tokenized stories directory {} contains {} files, but it \"\n \"should contain the same number as {} (which has {} files). 
Was\"\n \" there an error during tokenization?\".format(\n tokenized_stories_dir, num_tokenized, stories_dir, num_orig)\n )\n print(\"Successfully finished tokenizing {} to {}.\\n\".format(\n stories_dir, tokenized_stories_dir))\n\n\ndef read_story_file(text_file):\n with open(text_file, \"r\") as f:\n # sentences are separated by 2 newlines\n # single newlines might be image captions\n # so will be incomplete sentence\n lines = f.read().split('\\n\\n')\n return lines\n\n\ndef hashhex(s):\n \"\"\"Returns a heximal formated SHA1 hash of the input string.\"\"\"\n h = hashlib.sha1()\n h.update(s.encode())\n return h.hexdigest()\n\n\ndef get_url_hashes(url_list):\n return [hashhex(url) for url in url_list]\n\n\ndef fix_missing_period(line):\n \"\"\"Adds a period to a line that is missing a period\"\"\"\n if \"@highlight\" in line:\n return line\n if line == \"\":\n return line\n if line[-1] in END_TOKENS:\n return line\n return line + \" .\"\n\n\ndef get_art_abs(story_file):\n \"\"\" return as list of sentences\"\"\"\n lines = read_story_file(story_file)\n\n # Lowercase, truncated trailing spaces, and normalize spaces\n lines = [' '.join(line.lower().strip().split()) for line in lines]\n\n # Put periods on the ends of lines that are missing them (this is a problem\n # in the dataset because many image captions don't end in periods;\n # consequently they end up in the body of the article as run-on sentences)\n lines = [fix_missing_period(line) for line in lines]\n\n # Separate out article and abstract sentences\n article_lines = []\n highlights = []\n next_is_highlight = False\n for idx, line in enumerate(lines):\n if line == \"\":\n continue # empty line\n elif line.startswith(\"@highlight\"):\n next_is_highlight = True\n elif next_is_highlight:\n highlights.append(line)\n else:\n article_lines.append(line)\n\n return article_lines, highlights\n\n\ndef write_to_tar(url_file, out_file, makevocab=False):\n \"\"\"Reads the tokenized .story files corresponding to the urls listed in the\n url_file and writes them to a out_file.\n \"\"\"\n print(\"Making bin file for URLs listed in {}...\".format(url_file))\n url_list = [line.strip() for line in open(url_file)]\n url_hashes = get_url_hashes(url_list)\n story_fnames = [s+\".story\" for s in url_hashes]\n num_stories = len(story_fnames)\n\n if makevocab:\n vocab_counter = collections.Counter()\n\n with tarfile.open(out_file, 'w') as writer:\n for idx, s in enumerate(story_fnames):\n if idx % 1000 == 0:\n print(\"Writing story {} of {}; {:.2f} percent done\".format(\n idx, num_stories, float(idx)*100.0/float(num_stories)))\n\n # Look in the tokenized story dirs to find the .story file\n # corresponding to this url\n if os.path.isfile(os.path.join(cnn_tokenized_stories_dir, s)):\n story_file = os.path.join(cnn_tokenized_stories_dir, s)\n elif os.path.isfile(os.path.join(dm_tokenized_stories_dir, s)):\n story_file = os.path.join(dm_tokenized_stories_dir, s)\n else:\n print(\"Error: Couldn't find tokenized story file {} in either\"\n \" tokenized story directories {} and {}. 
Was there an\"\n \" error during tokenization?\".format(\n s, cnn_tokenized_stories_dir,\n dm_tokenized_stories_dir))\n # Check again if tokenized stories directories contain correct\n # number of files\n print(\"Checking that the tokenized stories directories {}\"\n \" and {} contain correct number of files...\".format(\n cnn_tokenized_stories_dir, dm_tokenized_stories_dir))\n check_num_stories(cnn_tokenized_stories_dir,\n num_expected_cnn_stories)\n check_num_stories(dm_tokenized_stories_dir,\n num_expected_dm_stories)\n raise Exception(\n \"Tokenized stories directories {} and {}\"\n \" contain correct number of files but story\"\n \" file {} found in neither.\".format(\n cnn_tokenized_stories_dir,\n dm_tokenized_stories_dir, s)\n )\n\n # Get the strings to write to .bin file\n article_sents, abstract_sents = get_art_abs(story_file)\n\n # Write to JSON file\n js_example = {}\n js_example['id'] = s.replace('.story', '')\n js_example['article'] = article_sents\n js_example['abstract'] = abstract_sents\n js_serialized = json.dumps(js_example, indent=4).encode()\n save_file = io.BytesIO(js_serialized)\n tar_info = tarfile.TarInfo('{}/{}.json'.format(\n os.path.basename(out_file).replace('.tar', ''), idx))\n tar_info.size = len(js_serialized)\n writer.addfile(tar_info, save_file)\n\n # Write the vocab to file, if applicable\n if makevocab:\n art_tokens = ' '.join(article_sents).split()\n abs_tokens = ' '.join(abstract_sents).split()\n tokens = art_tokens + abs_tokens\n tokens = [t.strip() for t in tokens] # strip\n tokens = [t for t in tokens if t != \"\"] # remove empty\n vocab_counter.update(tokens)\n\n print(\"Finished writing file {}\\n\".format(out_file))\n\n # write vocab to file\n if makevocab:\n print(\"Writing vocab file...\")\n with open(os.path.join(finished_files_dir, \"vocab_cnt.pkl\"),\n 'wb') as vocab_file:\n pkl.dump(vocab_counter, vocab_file)\n print(\"Finished writing vocab file\")\n\n\ndef check_num_stories(stories_dir, num_expected):\n num_stories = len(os.listdir(stories_dir))\n if num_stories != num_expected:\n raise Exception(\n \"stories directory {} contains {} files\"\n \" but should contain {}\".format(\n stories_dir, num_stories, num_expected)\n )\n\n",
"execution_count": null,
"outputs": []
},
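{
"metadata": {},
"cell_type": "markdown",
"source": "The functions above are pasted from the CNN/DailyMail preprocessing script (make_datafiles.py). A minimal usage sketch follows; the directory names cnn/stories and dailymail/stories are placeholders for local copies of the raw .story dumps, and those steps are left commented out because the raw corpora are not present in this environment."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# hedged sketch of how the helpers above fit together (placeholder paths)\n\n# URLs in url_lists/ map to .story filenames via a SHA1 hash\nexample_hash = hashhex('http://example.com/some-article') # hypothetical URL\nprint(example_hash)\n\n# split one tokenized story into article sentences and highlight sentences\n# article_sents, abstract_sents = get_art_abs(\n# os.path.join(cnn_tokenized_stories_dir, example_hash + '.story'))\n\n# full pipeline: tokenize the raw .story dumps, then pack each split into a tar\n# tokenize_stories('cnn/stories', cnn_tokenized_stories_dir)\n# tokenize_stories('dailymail/stories', dm_tokenized_stories_dir)\n# write_to_tar(all_val_urls, os.path.join(finished_files_dir, 'val.tar'))\n# write_to_tar(all_test_urls, os.path.join(finished_files_dir, 'test.tar'))\n# write_to_tar(all_train_urls, os.path.join(finished_files_dir, 'train.tar'), makevocab=True)",
"execution_count": null,
"outputs": []
},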
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import tensorflow as tf\nfrom tensorflow.core.example import example_pb2\nimport struct,sys,nltk,json,io,tarfile\n\ndef write_to_json(articles, abstracts = None,out_file='custom_file.json'):\n \"\"\"article and abstract are list of string \n out_file : output file name \n \"\"\"\n tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')\n\n if abstracts is not None:\n if len(articles)==len(abstracts):\n print(\"equal length \")\n else:\n raise Exception(\"number of stories in article and abstract are different\")\n \n for idx,article in enumerate(articles): \n article_lines = tokenizer.tokenize(article) \n \n if abstracts is not None:\n abstract_lines = tokenizer.tokenize(abstracts[idx])\n #clean text after it\n # Make article into a single string\n article = ' '.join(article_lines)\n with open(os.path.join(\"data/cnn-dailymail/finished_files/val\",out_file.replace('.json','_'+str(idx)+'.json')), 'wb') as writer:\n\n # Write to tf.Example\n js_example = {}\n js_example['id'] = \" \"\n js_example['article'] = article_lines\n if abstracts is not None:\n js_example['abstract'] = abstract_lines\n else:\n js_example['abstract'] = \" \"\n js_serialized = json.dumps(js_example, indent=4).encode()\n writer.write(js_serialized)\n# save_file = io.BytesIO(js_serialized)\n# tar_info = tarfile.TarInfo('{}/{}.json'.format(\n# os.path.basename(out_file).replace('.tar', ''), idx))\n# tar_info.size = len(js_serialized)\n# writer.addfile(tar_info, save_file)\n\n",
"execution_count": 49,
"outputs": []
},
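{
"metadata": {},
"cell_type": "markdown",
"source": "A hedged usage sketch for write_to_json defined above: the article and abstract strings are made-up placeholders, and the output would land under data/cnn-dailymail/finished_files/val because that path is hard-coded in the function."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# hedged example: placeholder article/abstract pair, which would be written as sample_0.json\nsample_articles = ['The quick brown fox jumps over the lazy dog. It then runs away.']\nsample_abstracts = ['A fox jumps over a dog.']\n# write_to_json(sample_articles, abstracts=sample_abstracts, out_file='sample.json')",
"execution_count": null,
"outputs": []
},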
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from goose3 import Goose\nurl = 'https://indianexpress.com/article/india/pm-narendra-modis-tribute-atal-bihari-vajpayee-overcame-the-hesitation-of-our-nation-5310788/'\ng = Goose()\narticle = g.extract(url=url)\n\n# article.title\n\n# article.meta_description\n\ntext = article.cleaned_text",
"execution_count": 11,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "write_to_json([text])",
"execution_count": 50,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#!{sys.executable} decode_full_model.py --path=data/cnn-dailymail/finished_files/ --model_dir=model/pretrained/acl --beam=1 --val\n",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "!rm -r custom/output/",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "!{sys.executable} decode_full_model.py --path=custom/ --model_dir=model/pretrained/acl --beam=1 --val",
"execution_count": null,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "with open('custom/output/0.dec','rb') as r:\n summary = r.read()",
"execution_count": 12,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "print(summary.decode(\"utf-8\"))",
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"text": "the century, was gifted in spirit, heart .\nfor those of us who knew him, he was, the rarest of human beings, who touched and inspired everyone he met. the rarest of human beings, who touched and inspired\norator could switch from disarming humour to a lofty vision .\nhe was deeply respectful of others and gifted with a rare sense of humour that he often turned upon himself. .\nhe was compassionate to the core, generous in spirit, warm beyond measure and kind to a fault. warm beyond measure and kind to a fault. fault.\nthe jana sangh, organized the only truly national-level party to be formed in independent india — — and helmed deendayal upadhyaya. .\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "summary[0]",
"execution_count": 10,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 10,
"data": {
"text/plain": "b'the century, was gifted in spirit, heart .\\n'"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "js_example['abstract'] = abstract_lines",
"execution_count": 5,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "ls data/cnn-dailymail/finished_files/",
"execution_count": 45,
"outputs": [
{
"output_type": "stream",
"text": "\u001b[0m\u001b[01;34mtest\u001b[0m/ \u001b[01;31mtest.tar\u001b[0m \u001b[01;34mtrain\u001b[0m/ \u001b[01;31mtrain.tar\u001b[0m \u001b[01;34mval\u001b[0m/ \u001b[01;34mval1\u001b[0m/ \u001b[01;31mval.tar\u001b[0m vocab_cnt.pkl\r\n",
"name": "stdout"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"kernelspec": {
"name": "conda-env-fast_abs_rl-py",
"display_name": "Python [conda env:fast_abs_rl]",
"language": "python"
},
"latex_envs": {
"eqNumInitial": 1,
"eqLabelWithNumbers": true,
"current_citInitial": 1,
"cite_by": "apalike",
"bibliofile": "biblio.bib",
"LaTeX_envs_menu_present": true,
"labels_anchors": false,
"latex_user_defs": false,
"user_envs_cfg": false,
"report_style_numbering": false,
"autoclose": false,
"autocomplete": true,
"hotkeys": {
"equation": "Ctrl-E",
"itemize": "Ctrl-I"
}
},
"language_info": {
"name": "python",
"version": "3.6.4",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"gist": {
"id": "e0e2abb0b9f72fe8a8a85f9ab5bfcb49",
"data": {
"description": "ABG/text_related/fast_abs_rl/Untitled.ipynb",
"public": true
}
},
"_draft": {
"nbviewer_url": "https://gist.github.com/e0e2abb0b9f72fe8a8a85f9ab5bfcb49"
}
},
"nbformat": 4,
"nbformat_minor": 2
}