Skip to content

Instantly share code, notes, and snippets.

@fdovila
Last active December 24, 2021 20:10
Show Gist options
  • Save fdovila/6383a0dbd0f88053899defb27426f166 to your computer and use it in GitHub Desktop.
EN_summariseer_ensemble.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "EN_summariseer_ensemble.ipynb",
"provenance": [],
"machine_shape": "hm",
"private_outputs": true,
"collapsed_sections": [],
"authorship_tag": "ABX9TyPpvSPcwgynyGql0xyqf+CW",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "TPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/fdovila/6383a0dbd0f88053899defb27426f166/sumarizar_texto_ejemplo.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"**Text summarisation ensembling 3 NLP transformers:**\n",
"* pegasus-xsum\n",
"* pegasus-large\n",
"* financial-summarization-pegasus\n",
"\n",
"\n",
"---\n",
"\n",
"\n",
"By F. Avila-Rencoret, MD, -2021-\n",
"\n"
],
"metadata": {
"id": "qK4UMc-460GI"
}
},
{
"cell_type": "code",
"source": [
"# Install transformers from source; %pip (not !pip) targets the running kernel's environment.\n",
"%pip install -q git+https://github.com/huggingface/transformers\n",
"# Smoke test that the install works (downloads a small default sentiment model).\n",
"from transformers import pipeline; print(pipeline('sentiment-analysis')('I hate you'))"
],
"metadata": {
"id": "gkYFqPwCsJ9p"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# sentencepiece is required by the Pegasus tokenizers.\n",
"%pip install -q sentencepiece\n",
"# git-lfs is needed so the large model weight files are fetched when cloning below.\n",
"!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash\n",
"!sudo apt-get install git-lfs"
],
"metadata": {
"id": "4SZkl9IaJL27"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Clone the three summarisation checkpoints (one loop instead of three copy-pasted cells).\n",
"# Requires git-lfs (installed above) so the model weights download fully.\n",
"PEGASUS_REPOS = [\n",
"    \"https://huggingface.co/google/pegasus-xsum\",\n",
"    \"https://huggingface.co/google/pegasus-large\",\n",
"    \"https://huggingface.co/human-centered-summarization/financial-summarization-pegasus\",\n",
"]\n",
"for repo in PEGASUS_REPOS:\n",
"    # IPython interpolates {repo} into the shell command.\n",
"    !git clone {repo}"
],
"metadata": {
"id": "RqH6npnEbpoH"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!huggingface-cli login"
],
"metadata": {
"id": "tD9qx8LMPSDJ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!git config --global credential.helper store"
],
"metadata": {
"id": "x6yMkBGZaHJL"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "PLEXWrC7qpPL"
},
"outputs": [],
"source": [
"import torch\n",
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
"\n",
"# NOTE(review): the notebook metadata requests a TPU runtime, but this check only\n",
"# detects CUDA GPUs -- on a TPU (or CPU) runtime everything runs on CPU.\n",
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
"\n",
"# Local clones produced by the `git clone` cell above.\n",
"MODEL_DIRS = [\n",
"    \"/content/pegasus-xsum\",\n",
"    \"/content/pegasus-large\",\n",
"    \"/content/financial-summarization-pegasus\",\n",
"]\n",
"\n",
"# Load each tokenizer/model pair once, then unpack to the names used by later cells.\n",
"tokenizer1, tokenizer2, tokenizer3 = (AutoTokenizer.from_pretrained(d) for d in MODEL_DIRS)\n",
"model1, model2, model3 = (AutoModelForSeq2SeqLM.from_pretrained(d).to(device) for d in MODEL_DIRS)"
]
},
{
"cell_type": "code",
"source": [
"def _generate_summary(text, tokenizer, model):\n",
"    \"\"\"Summarise `text` with one tokenizer/model pair; return the decoded summary string.\"\"\"\n",
"    # Truncate/pad the input to 250 tokens so all three models see identical input.\n",
"    inputs = tokenizer([text], padding=\"max_length\", truncation=True, max_length=250, return_tensors=\"pt\")\n",
"    input_ids = inputs.input_ids.to(device)\n",
"    attention_mask = inputs.attention_mask.to(device)\n",
"    output = model.generate(input_ids, attention_mask=attention_mask)\n",
"    return tokenizer.decode(output[0], skip_special_tokens=True)\n",
"\n",
"# Thin wrappers preserve the names that later cells call.\n",
"def generate_summary1(text):\n",
"    return _generate_summary(text, tokenizer1, model1)\n",
"\n",
"def generate_summary2(text):\n",
"    return _generate_summary(text, tokenizer2, model2)\n",
"\n",
"def generate_summary3(text):\n",
"    return _generate_summary(text, tokenizer3, model3)\n",
"\n",
"# Example input. (The original Spanish comment claimed 'Chapter 1 of Don Quixote',\n",
"# but the sample below is actually the PG&E blackout excerpt from the Pegasus docs.)\n",
"#text = \"PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.\"\n"
],
"metadata": {
"id": "nfD7bRFKiwxN"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Sentence to summarise\n",
"text = 'During my Master of Research and my PhD thesis, supervised by Prof DS Elson, Dr GP Mylonas and Prof A Darzi (Imperial College London), developed a robotic endoscopy framework for semi-automatic wide-field optical biopsy imaging in the gastrointestinal tract for the early detection of gastrointestinal cancers.' #@param {type:\"string\"}\n",
"print(text)"
],
"metadata": {
"id": "ZhL1QfbPjUfY"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(text)\n",
"sentence1 = generate_summary1(text)\n",
"sentence2 = generate_summary2(text)\n",
"sentence3 = generate_summary3(text)\n"
],
"metadata": {
"id": "k7OVScn5jKSD"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(sentence1,\"\\n\")\n",
"print(sentence2,\"\\n\")\n",
"print(sentence3,\"\\n\")\n",
"\n",
"# Ensemble step: feed the concatenation of the two longer summaries back through\n",
"# each model. Join with a space so the last word of sentence2 and the first word\n",
"# of sentence3 are not fused into a single token.\n",
"text2 = sentence2 + \" \" + sentence3\n",
"\n",
"print(text2,\"\\n\")\n",
"\n",
"sentence1a = generate_summary1(text2)\n",
"sentence2b = generate_summary2(text2)\n",
"sentence3c = generate_summary3(text2)"
],
"metadata": {
"id": "KKU0du9J1o2L"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(sentence1a)"
],
"metadata": {
"id": "B_Z5M9E4fWoa"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(sentence2b)"
],
"metadata": {
"id": "NCiWk_yHh-Wt"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(sentence3c)"
],
"metadata": {
"id": "Zt7-IDrviAP9"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment