Skip to content

Instantly share code, notes, and snippets.

@fdovila
Last active December 24, 2021 20:10
Show Gist options
  • Save fdovila/6383a0dbd0f88053899defb27426f166 to your computer and use it in GitHub Desktop.
EN_summariseer_ensemble.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "EN_summariseer_ensemble.ipynb",
"provenance": [],
"machine_shape": "hm",
"private_outputs": true,
"collapsed_sections": [],
"authorship_tag": "ABX9TyPpvSPcwgynyGql0xyqf+CW",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "TPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/fdovila/6383a0dbd0f88053899defb27426f166/sumarizar_texto_ejemplo.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"**Text summarisation ensembling 3 NLP transformers:**\n",
"* pegasus-xsum\n",
"* pegasus-large\n",
"* financial-summarization-pegasus\n",
"\n",
"\n",
"---\n",
"\n",
"\n",
"By F. Avila-Rencoret, MD, -2021-\n",
"\n"
],
"metadata": {
"id": "qK4UMc-460GI"
}
},
{
"cell_type": "code",
"source": [
"# Install transformers from source; %pip (not !pip) targets the running kernel's environment.\n",
"%pip install -q git+https://github.com/huggingface/transformers\n",
"# Smoke test that the install works (downloads a small default sentiment model).\n",
"from transformers import pipeline; print(pipeline('sentiment-analysis')('I hate you'))"
],
"metadata": {
"id": "gkYFqPwCsJ9p"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# sentencepiece is required by the Pegasus tokenizers.\n",
"%pip install -q sentencepiece\n",
"# git-lfs is needed so the large model weight files are fetched when cloning below.\n",
"!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash\n",
"!sudo apt-get install git-lfs"
],
"metadata": {
"id": "4SZkl9IaJL27"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Clone the three summarisation checkpoints (one loop instead of three copy-pasted cells).\n",
"# Requires git-lfs (installed above) so the model weights download fully.\n",
"PEGASUS_REPOS = [\n",
"    \"https://huggingface.co/google/pegasus-xsum\",\n",
"    \"https://huggingface.co/google/pegasus-large\",\n",
"    \"https://huggingface.co/human-centered-summarization/financial-summarization-pegasus\",\n",
"]\n",
"for repo in PEGASUS_REPOS:\n",
"    # IPython interpolates {repo} into the shell command.\n",
"    !git clone {repo}"
],
"metadata": {
"id": "RqH6npnEbpoH"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!huggingface-cli login"
],
"metadata": {
"id": "tD9qx8LMPSDJ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!git config --global credential.helper store"
],
"metadata": {
"id": "x6yMkBGZaHJL"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "PLEXWrC7qpPL"
},
"outputs": [],
"source": [
"import torch\n",
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
"\n",
"# NOTE(review): the notebook metadata requests a TPU runtime, but this check only\n",
"# detects CUDA GPUs -- on a TPU (or CPU) runtime everything runs on CPU.\n",
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
"\n",
"# Local clones produced by the `git clone` cell above.\n",
"MODEL_DIRS = [\n",
"    \"/content/pegasus-xsum\",\n",
"    \"/content/pegasus-large\",\n",
"    \"/content/financial-summarization-pegasus\",\n",
"]\n",
"\n",
"# Load each tokenizer/model pair once, then unpack to the names used by later cells.\n",
"tokenizer1, tokenizer2, tokenizer3 = (AutoTokenizer.from_pretrained(d) for d in MODEL_DIRS)\n",
"model1, model2, model3 = (AutoModelForSeq2SeqLM.from_pretrained(d).to(device) for d in MODEL_DIRS)"
]
},
{
"cell_type": "code",
"source": [
"def _generate_summary(text, tokenizer, model):\n",
"    \"\"\"Summarise `text` with one tokenizer/model pair; return the decoded summary string.\"\"\"\n",
"    # Truncate/pad the input to 250 tokens so all three models see identical input.\n",
"    inputs = tokenizer([text], padding=\"max_length\", truncation=True, max_length=250, return_tensors=\"pt\")\n",
"    input_ids = inputs.input_ids.to(device)\n",
"    attention_mask = inputs.attention_mask.to(device)\n",
"    output = model.generate(input_ids, attention_mask=attention_mask)\n",
"    return tokenizer.decode(output[0], skip_special_tokens=True)\n",
"\n",
"# Thin wrappers preserve the names that later cells call.\n",
"def generate_summary1(text):\n",
"    return _generate_summary(text, tokenizer1, model1)\n",
"\n",
"def generate_summary2(text):\n",
"    return _generate_summary(text, tokenizer2, model2)\n",
"\n",
"def generate_summary3(text):\n",
"    return _generate_summary(text, tokenizer3, model3)\n",
"\n",
"# Example input. (The original Spanish comment claimed 'Chapter 1 of Don Quixote',\n",
"# but the sample below is actually the PG&E blackout excerpt from the Pegasus docs.)\n",
"#text = \"PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.\"\n"
],
"metadata": {
"id": "nfD7bRFKiwxN"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"#@title Sentence to summarise\n",
"text = 'During my Master of Research and my PhD thesis, supervised by Prof DS Elson, Dr GP Mylonas and Prof A Darzi (Imperial College London), developed a robotic endoscopy framework for semi-automatic wide-field optical biopsy imaging in the gastrointestinal tract for the early detection of gastrointestinal cancers.' #@param {type:\"string\"}\n",
"print(text)"
],
"metadata": {
"id": "ZhL1QfbPjUfY"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(text)\n",
"sentence1 = generate_summary1(text)\n",
"sentence2 = generate_summary2(text)\n",
"sentence3 = generate_summary3(text)\n"
],
"metadata": {
"id": "k7OVScn5jKSD"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(sentence1,\"\\n\")\n",
"print(sentence2,\"\\n\")\n",
"print(sentence3,\"\\n\")\n",
"\n",
"# Ensemble step: feed the concatenation of the two longer summaries back through\n",
"# each model. Join with a space so the last word of sentence2 and the first word\n",
"# of sentence3 are not fused into a single token.\n",
"text2 = sentence2 + \" \" + sentence3\n",
"\n",
"print(text2,\"\\n\")\n",
"\n",
"sentence1a = generate_summary1(text2)\n",
"sentence2b = generate_summary2(text2)\n",
"sentence3c = generate_summary3(text2)"
],
"metadata": {
"id": "KKU0du9J1o2L"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(sentence1a)"
],
"metadata": {
"id": "B_Z5M9E4fWoa"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(sentence2b)"
],
"metadata": {
"id": "NCiWk_yHh-Wt"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print(sentence3c)"
],
"metadata": {
"id": "Zt7-IDrviAP9"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment