-
-
Save brockmanmatt/1b08f965cdfaab8820eb81657c39f494 to your computer and use it in GitHub Desktop.
testArticleLength.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "testArticleLength.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"authorship_tag": "ABX9TyN+llHKfIKmzU39lCy4jg3S", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/brockmanmatt/1b08f965cdfaab8820eb81657c39f494/testarticlelength.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "J7wnsgT2kPut", | |
"colab_type": "code", | |
"colab": { | |
"resources": { | |
"http://localhost:8080/nbextensions/google.colab/files.js": { | |
"data": "Ly8gQ29weXJpZ2h0IDIwMTcgR29vZ2xlIExMQwovLwovLyBMaWNlbnNlZCB1bmRlciB0aGUgQXBhY2hlIExpY2Vuc2UsIFZlcnNpb24gMi4wICh0aGUgIkxpY2Vuc2UiKTsKLy8geW91IG1heSBub3QgdXNlIHRoaXMgZmlsZSBleGNlcHQgaW4gY29tcGxpYW5jZSB3aXRoIHRoZSBMaWNlbnNlLgovLyBZb3UgbWF5IG9idGFpbiBhIGNvcHkgb2YgdGhlIExpY2Vuc2UgYXQKLy8KLy8gICAgICBodHRwOi8vd3d3LmFwYWNoZS5vcmcvbGljZW5zZXMvTElDRU5TRS0yLjAKLy8KLy8gVW5sZXNzIHJlcXVpcmVkIGJ5IGFwcGxpY2FibGUgbGF3IG9yIGFncmVlZCB0byBpbiB3cml0aW5nLCBzb2Z0d2FyZQovLyBkaXN0cmlidXRlZCB1bmRlciB0aGUgTGljZW5zZSBpcyBkaXN0cmlidXRlZCBvbiBhbiAiQVMgSVMiIEJBU0lTLAovLyBXSVRIT1VUIFdBUlJBTlRJRVMgT1IgQ09ORElUSU9OUyBPRiBBTlkgS0lORCwgZWl0aGVyIGV4cHJlc3Mgb3IgaW1wbGllZC4KLy8gU2VlIHRoZSBMaWNlbnNlIGZvciB0aGUgc3BlY2lmaWMgbGFuZ3VhZ2UgZ292ZXJuaW5nIHBlcm1pc3Npb25zIGFuZAovLyBsaW1pdGF0aW9ucyB1bmRlciB0aGUgTGljZW5zZS4KCi8qKgogKiBAZmlsZW92ZXJ2aWV3IEhlbHBlcnMgZm9yIGdvb2dsZS5jb2xhYiBQeXRob24gbW9kdWxlLgogKi8KKGZ1bmN0aW9uKHNjb3BlKSB7CmZ1bmN0aW9uIHNwYW4odGV4dCwgc3R5bGVBdHRyaWJ1dGVzID0ge30pIHsKICBjb25zdCBlbGVtZW50ID0gZG9jdW1lbnQuY3JlYXRlRWxlbWVudCgnc3BhbicpOwogIGVsZW1lbnQudGV4dENvbnRlbnQgPSB0ZXh0OwogIGZvciAoY29uc3Qga2V5IG9mIE9iamVjdC5rZXlzKHN0eWxlQXR0cmlidXRlcykpIHsKICAgIGVsZW1lbnQuc3R5bGVba2V5XSA9IHN0eWxlQXR0cmlidXRlc1trZXldOwogIH0KICByZXR1cm4gZWxlbWVudDsKfQoKLy8gTWF4IG51bWJlciBvZiBieXRlcyB3aGljaCB3aWxsIGJlIHVwbG9hZGVkIGF0IGEgdGltZS4KY29uc3QgTUFYX1BBWUxPQURfU0laRSA9IDEwMCAqIDEwMjQ7CgpmdW5jdGlvbiBfdXBsb2FkRmlsZXMoaW5wdXRJZCwgb3V0cHV0SWQpIHsKICBjb25zdCBzdGVwcyA9IHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCk7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICAvLyBDYWNoZSBzdGVwcyBvbiB0aGUgb3V0cHV0RWxlbWVudCB0byBtYWtlIGl0IGF2YWlsYWJsZSBmb3IgdGhlIG5leHQgY2FsbAogIC8vIHRvIHVwbG9hZEZpbGVzQ29udGludWUgZnJvbSBQeXRob24uCiAgb3V0cHV0RWxlbWVudC5zdGVwcyA9IHN0ZXBzOwoKICByZXR1cm4gX3VwbG9hZEZpbGVzQ29udGludWUob3V0cHV0SWQpOwp9CgovLyBUaGlzIGlzIHJvdWdobHkgYW4gYXN5bmMgZ2VuZXJhdG9yIChub3Qgc3VwcG9ydGVkIGluIHRoZSBicm93c2VyIHlldCksCi8vIHdoZXJlIHRoZXJlIGFyZSBtdWx0aXBsZSBhc3luY2hyb25vdXMgc3RlcHMgYW5kIHRoZSB
QeXRob24gc2lkZSBpcyBnb2luZwovLyB0byBwb2xsIGZvciBjb21wbGV0aW9uIG9mIGVhY2ggc3RlcC4KLy8gVGhpcyB1c2VzIGEgUHJvbWlzZSB0byBibG9jayB0aGUgcHl0aG9uIHNpZGUgb24gY29tcGxldGlvbiBvZiBlYWNoIHN0ZXAsCi8vIHRoZW4gcGFzc2VzIHRoZSByZXN1bHQgb2YgdGhlIHByZXZpb3VzIHN0ZXAgYXMgdGhlIGlucHV0IHRvIHRoZSBuZXh0IHN0ZXAuCmZ1bmN0aW9uIF91cGxvYWRGaWxlc0NvbnRpbnVlKG91dHB1dElkKSB7CiAgY29uc3Qgb3V0cHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKG91dHB1dElkKTsKICBjb25zdCBzdGVwcyA9IG91dHB1dEVsZW1lbnQuc3RlcHM7CgogIGNvbnN0IG5leHQgPSBzdGVwcy5uZXh0KG91dHB1dEVsZW1lbnQubGFzdFByb21pc2VWYWx1ZSk7CiAgcmV0dXJuIFByb21pc2UucmVzb2x2ZShuZXh0LnZhbHVlLnByb21pc2UpLnRoZW4oKHZhbHVlKSA9PiB7CiAgICAvLyBDYWNoZSB0aGUgbGFzdCBwcm9taXNlIHZhbHVlIHRvIG1ha2UgaXQgYXZhaWxhYmxlIHRvIHRoZSBuZXh0CiAgICAvLyBzdGVwIG9mIHRoZSBnZW5lcmF0b3IuCiAgICBvdXRwdXRFbGVtZW50Lmxhc3RQcm9taXNlVmFsdWUgPSB2YWx1ZTsKICAgIHJldHVybiBuZXh0LnZhbHVlLnJlc3BvbnNlOwogIH0pOwp9CgovKioKICogR2VuZXJhdG9yIGZ1bmN0aW9uIHdoaWNoIGlzIGNhbGxlZCBiZXR3ZWVuIGVhY2ggYXN5bmMgc3RlcCBvZiB0aGUgdXBsb2FkCiAqIHByb2Nlc3MuCiAqIEBwYXJhbSB7c3RyaW5nfSBpbnB1dElkIEVsZW1lbnQgSUQgb2YgdGhlIGlucHV0IGZpbGUgcGlja2VyIGVsZW1lbnQuCiAqIEBwYXJhbSB7c3RyaW5nfSBvdXRwdXRJZCBFbGVtZW50IElEIG9mIHRoZSBvdXRwdXQgZGlzcGxheS4KICogQHJldHVybiB7IUl0ZXJhYmxlPCFPYmplY3Q+fSBJdGVyYWJsZSBvZiBuZXh0IHN0ZXBzLgogKi8KZnVuY3Rpb24qIHVwbG9hZEZpbGVzU3RlcChpbnB1dElkLCBvdXRwdXRJZCkgewogIGNvbnN0IGlucHV0RWxlbWVudCA9IGRvY3VtZW50LmdldEVsZW1lbnRCeUlkKGlucHV0SWQpOwogIGlucHV0RWxlbWVudC5kaXNhYmxlZCA9IGZhbHNlOwoKICBjb25zdCBvdXRwdXRFbGVtZW50ID0gZG9jdW1lbnQuZ2V0RWxlbWVudEJ5SWQob3V0cHV0SWQpOwogIG91dHB1dEVsZW1lbnQuaW5uZXJIVE1MID0gJyc7CgogIGNvbnN0IHBpY2tlZFByb21pc2UgPSBuZXcgUHJvbWlzZSgocmVzb2x2ZSkgPT4gewogICAgaW5wdXRFbGVtZW50LmFkZEV2ZW50TGlzdGVuZXIoJ2NoYW5nZScsIChlKSA9PiB7CiAgICAgIHJlc29sdmUoZS50YXJnZXQuZmlsZXMpOwogICAgfSk7CiAgfSk7CgogIGNvbnN0IGNhbmNlbCA9IGRvY3VtZW50LmNyZWF0ZUVsZW1lbnQoJ2J1dHRvbicpOwogIGlucHV0RWxlbWVudC5wYXJlbnRFbGVtZW50LmFwcGVuZENoaWxkKGNhbmNlbCk7CiAgY2FuY2VsLnRleHRDb250ZW50ID0gJ0NhbmNlbCB1cGxvYWQnOwogIGNvbnN0IGNhbmNlbFByb21pc2UgPSBuZXcgUHJvbWl
zZSgocmVzb2x2ZSkgPT4gewogICAgY2FuY2VsLm9uY2xpY2sgPSAoKSA9PiB7CiAgICAgIHJlc29sdmUobnVsbCk7CiAgICB9OwogIH0pOwoKICAvLyBXYWl0IGZvciB0aGUgdXNlciB0byBwaWNrIHRoZSBmaWxlcy4KICBjb25zdCBmaWxlcyA9IHlpZWxkIHsKICAgIHByb21pc2U6IFByb21pc2UucmFjZShbcGlja2VkUHJvbWlzZSwgY2FuY2VsUHJvbWlzZV0pLAogICAgcmVzcG9uc2U6IHsKICAgICAgYWN0aW9uOiAnc3RhcnRpbmcnLAogICAgfQogIH07CgogIGNhbmNlbC5yZW1vdmUoKTsKCiAgLy8gRGlzYWJsZSB0aGUgaW5wdXQgZWxlbWVudCBzaW5jZSBmdXJ0aGVyIHBpY2tzIGFyZSBub3QgYWxsb3dlZC4KICBpbnB1dEVsZW1lbnQuZGlzYWJsZWQgPSB0cnVlOwoKICBpZiAoIWZpbGVzKSB7CiAgICByZXR1cm4gewogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbXBsZXRlJywKICAgICAgfQogICAgfTsKICB9CgogIGZvciAoY29uc3QgZmlsZSBvZiBmaWxlcykgewogICAgY29uc3QgbGkgPSBkb2N1bWVudC5jcmVhdGVFbGVtZW50KCdsaScpOwogICAgbGkuYXBwZW5kKHNwYW4oZmlsZS5uYW1lLCB7Zm9udFdlaWdodDogJ2JvbGQnfSkpOwogICAgbGkuYXBwZW5kKHNwYW4oCiAgICAgICAgYCgke2ZpbGUudHlwZSB8fCAnbi9hJ30pIC0gJHtmaWxlLnNpemV9IGJ5dGVzLCBgICsKICAgICAgICBgbGFzdCBtb2RpZmllZDogJHsKICAgICAgICAgICAgZmlsZS5sYXN0TW9kaWZpZWREYXRlID8gZmlsZS5sYXN0TW9kaWZpZWREYXRlLnRvTG9jYWxlRGF0ZVN0cmluZygpIDoKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgJ24vYSd9IC0gYCkpOwogICAgY29uc3QgcGVyY2VudCA9IHNwYW4oJzAlIGRvbmUnKTsKICAgIGxpLmFwcGVuZENoaWxkKHBlcmNlbnQpOwoKICAgIG91dHB1dEVsZW1lbnQuYXBwZW5kQ2hpbGQobGkpOwoKICAgIGNvbnN0IGZpbGVEYXRhUHJvbWlzZSA9IG5ldyBQcm9taXNlKChyZXNvbHZlKSA9PiB7CiAgICAgIGNvbnN0IHJlYWRlciA9IG5ldyBGaWxlUmVhZGVyKCk7CiAgICAgIHJlYWRlci5vbmxvYWQgPSAoZSkgPT4gewogICAgICAgIHJlc29sdmUoZS50YXJnZXQucmVzdWx0KTsKICAgICAgfTsKICAgICAgcmVhZGVyLnJlYWRBc0FycmF5QnVmZmVyKGZpbGUpOwogICAgfSk7CiAgICAvLyBXYWl0IGZvciB0aGUgZGF0YSB0byBiZSByZWFkeS4KICAgIGxldCBmaWxlRGF0YSA9IHlpZWxkIHsKICAgICAgcHJvbWlzZTogZmlsZURhdGFQcm9taXNlLAogICAgICByZXNwb25zZTogewogICAgICAgIGFjdGlvbjogJ2NvbnRpbnVlJywKICAgICAgfQogICAgfTsKCiAgICAvLyBVc2UgYSBjaHVua2VkIHNlbmRpbmcgdG8gYXZvaWQgbWVzc2FnZSBzaXplIGxpbWl0cy4gU2VlIGIvNjIxMTU2NjAuCiAgICBsZXQgcG9zaXRpb24gPSAwOwogICAgd2hpbGUgKHBvc2l0aW9uIDwgZmlsZURhdGEuYnl0ZUxlbmd0aCkgewogICAgICBjb25zdCBsZW5ndGggPSBNYXRoLm1pbihmaWxlRGF0YS5
ieXRlTGVuZ3RoIC0gcG9zaXRpb24sIE1BWF9QQVlMT0FEX1NJWkUpOwogICAgICBjb25zdCBjaHVuayA9IG5ldyBVaW50OEFycmF5KGZpbGVEYXRhLCBwb3NpdGlvbiwgbGVuZ3RoKTsKICAgICAgcG9zaXRpb24gKz0gbGVuZ3RoOwoKICAgICAgY29uc3QgYmFzZTY0ID0gYnRvYShTdHJpbmcuZnJvbUNoYXJDb2RlLmFwcGx5KG51bGwsIGNodW5rKSk7CiAgICAgIHlpZWxkIHsKICAgICAgICByZXNwb25zZTogewogICAgICAgICAgYWN0aW9uOiAnYXBwZW5kJywKICAgICAgICAgIGZpbGU6IGZpbGUubmFtZSwKICAgICAgICAgIGRhdGE6IGJhc2U2NCwKICAgICAgICB9LAogICAgICB9OwogICAgICBwZXJjZW50LnRleHRDb250ZW50ID0KICAgICAgICAgIGAke01hdGgucm91bmQoKHBvc2l0aW9uIC8gZmlsZURhdGEuYnl0ZUxlbmd0aCkgKiAxMDApfSUgZG9uZWA7CiAgICB9CiAgfQoKICAvLyBBbGwgZG9uZS4KICB5aWVsZCB7CiAgICByZXNwb25zZTogewogICAgICBhY3Rpb246ICdjb21wbGV0ZScsCiAgICB9CiAgfTsKfQoKc2NvcGUuZ29vZ2xlID0gc2NvcGUuZ29vZ2xlIHx8IHt9OwpzY29wZS5nb29nbGUuY29sYWIgPSBzY29wZS5nb29nbGUuY29sYWIgfHwge307CnNjb3BlLmdvb2dsZS5jb2xhYi5fZmlsZXMgPSB7CiAgX3VwbG9hZEZpbGVzLAogIF91cGxvYWRGaWxlc0NvbnRpbnVlLAp9Owp9KShzZWxmKTsK", | |
"ok": true, | |
"headers": [ | |
[ | |
"content-type", | |
"application/javascript" | |
] | |
], | |
"status": 200, | |
"status_text": "" | |
} | |
}, | |
"base_uri": "https://localhost:8080/", | |
"height": 89 | |
}, | |
"outputId": "cccf6ca1-bab4-44d1-f1eb-48b63f1b33b1" | |
}, | |
"source": [ | |
"from google.colab import files\n", | |
"uploaded = files.upload()\n", | |
"print(\"done\")" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/html": [ | |
"\n", | |
" <input type=\"file\" id=\"files-daf89009-84de-4fd7-a694-e88cc598ad2a\" name=\"files[]\" multiple disabled\n", | |
" style=\"border:none\" />\n", | |
" <output id=\"result-daf89009-84de-4fd7-a694-e88cc598ad2a\">\n", | |
" Upload widget is only available when the cell has been executed in the\n", | |
" current browser session. Please rerun this cell to enable.\n", | |
" </output>\n", | |
" <script src=\"/nbextensions/google.colab/files.js\"></script> " | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
}, | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Saving key.json to key.json\n", | |
"done\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "WHPHrUnhpKnI", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"I'll install the OpenAI API client library." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "zq0ltp2xn4yt", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 139 | |
}, | |
"outputId": "e4a549c3-e66e-4835-a353-21268b7889b4" | |
}, | |
"source": [ | |
"!pip install openai\n", | |
"import openai, json, pandas as pd, numpy as np" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Requirement already satisfied: openai in /usr/local/lib/python3.6/dist-packages (0.2.4)\n", | |
"Requirement already satisfied: requests>=2.20; python_version >= \"3.0\" in /usr/local/lib/python3.6/dist-packages (from openai) (2.23.0)\n", | |
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.20; python_version >= \"3.0\"->openai) (1.24.3)\n", | |
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.20; python_version >= \"3.0\"->openai) (2020.6.20)\n", | |
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.20; python_version >= \"3.0\"->openai) (2.10)\n", | |
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.20; python_version >= \"3.0\"->openai) (3.0.4)\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "Q2yE0jcnpMEV", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Loading in the key.json that I uploaded; I do this so I don't need to worry about accidentally leaking credentials if I share the Colab notebook (which I'm 99% sure is just a JSON file that won't expose them)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "bwNXXwHen5x9", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"openai.api_key = json.load(open(\"key.json\", \"r\"))[\"key\"]" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "k67w5H0fpTkT", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Default keyword arguments to pass to the API" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "e1EwpqqJkTYh", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"#arguments to send the API\n", | |
"kwargs = {\n", | |
"\"engine\":\"davinci\",\n", | |
"\"temperature\":0,\n", | |
"\"max_tokens\":150,\n", | |
"\"stop\":\"\\n\",\n", | |
"}" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "zZubgPoOpWDH", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Quick wrapper around the API call that returns the stripped completion text. (It does not save prompts and responses yet; logging them for later analysis could be added here if needed.)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "sXTDJx0An9Bl", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"def query(prompt, myKwargs = kwargs):\n", | |
" \"\"\"\n", | |
" wrapper for the API\n", | |
" \"\"\"\n", | |
" r = openai.Completion.create(prompt=prompt, **myKwargs)[\"choices\"][0][\"text\"].strip()\n", | |
" return r" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "EdFXafcJpZ3Q", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"Test to make sure my query works" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "4SlyKgjyopPn", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "7b3ac2ba-dc2f-4af5-eec0-7c1713d74276" | |
}, | |
"source": [ | |
"newKwargs = kwargs.copy()\n", | |
"newKwargs[\"stop\"] = \"\\n\"\n", | |
"query(\"q: what is 1+1?\\na:\", newKwargs)" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "string" | |
}, | |
"text/plain": [ | |
"'2'" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 5 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "6Fmrw3xEyHB9", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"cnn = pd.read_csv(\"https://raw.githubusercontent.com/brockmanmatt/CoverageTrends/master/archived_links/cnn/202007/cnn_20200726.csv\")" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "tNMKdXWsyVCC", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"fox = pd.read_csv(\"https://raw.githubusercontent.com/brockmanmatt/CoverageTrends/master/archived_links/foxnews/202007/foxnews_20200726.csv\")" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "n-pYPL3fgGW5", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"nytimes = pd.read_csv(\"https://raw.githubusercontent.com/brockmanmatt/CoverageTrends/master/archived_links/newyorktimes/202007/newyorktimes_20200726.csv\")" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "GDmICW_5gQ0e", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"washingtonpost = pd.read_csv(\"https://raw.githubusercontent.com/brockmanmatt/CoverageTrends/master/archived_links/washingtonpost/202007/washingtonpost_20200726.csv\")" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "-4s62vkIyYa-", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"cnn = cnn.drop_duplicates(subset=\"text\").drop_duplicates(subset=\"Unnamed: 0\")\n", | |
"fox = fox.drop_duplicates(subset=\"text\").drop_duplicates(subset=\"Unnamed: 0\")\n", | |
"nytimes = nytimes.drop_duplicates(subset=\"text\").drop_duplicates(subset=\"Unnamed: 0\")\n", | |
"washingtonpost = washingtonpost.drop_duplicates(subset=\"text\").drop_duplicates(subset=\"Unnamed: 0\")" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ZATEfOnryx4P", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"cnn[\"org\"] = \"cnn.com\"\n", | |
"fox[\"org\"] = \"foxnews.com\"\n", | |
"nytimes[\"org\"] = \"nytimes.com\"\n", | |
"washingtonpost[\"org\"] = \"washingtonpost.com\"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "iCd-Ewgx4tps", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"cnn[\"url\"] = cnn[\"Unnamed: 0\"].apply(lambda x: x if x.startswith(\"h\") else \"https://cnn.com\"+x)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "HcCAVS3EmXyc", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"nytimes[\"url\"] = nytimes[\"Unnamed: 0\"].apply(lambda x: x if x.startswith(\"h\") else \"https://nytimes.com\"+x)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "jDO8U7y0oWj8", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"fox[\"url\"] = fox[\"Unnamed: 0\"]\n", | |
"washingtonpost[\"url\"] = washingtonpost[\"Unnamed: 0\"]" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "s9VaJwP98hOP", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"all_arts = pd.concat([cnn, fox, nytimes, washingtonpost])" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "igAt7zANmqs5", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"all_arts.reset_index(inplace=True, drop=True)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "P65mdQrqmuh2", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"all_arts = all_arts.sample(len(all_arts))" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "fDw3KJdq9lBf", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 394 | |
}, | |
"outputId": "4326b23e-2396-4c69-ed9e-ef05ab3aa82f" | |
}, | |
"source": [ | |
"!pip3 install newspaper3k" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Requirement already satisfied: newspaper3k in /usr/local/lib/python3.6/dist-packages (0.2.8)\n", | |
"Requirement already satisfied: lxml>=3.6.0 in /usr/local/lib/python3.6/dist-packages (from newspaper3k) (4.2.6)\n", | |
"Requirement already satisfied: jieba3k>=0.35.1 in /usr/local/lib/python3.6/dist-packages (from newspaper3k) (0.35.1)\n", | |
"Requirement already satisfied: PyYAML>=3.11 in /usr/local/lib/python3.6/dist-packages (from newspaper3k) (3.13)\n", | |
"Requirement already satisfied: feedfinder2>=0.0.4 in /usr/local/lib/python3.6/dist-packages (from newspaper3k) (0.0.4)\n", | |
"Requirement already satisfied: tinysegmenter==0.3 in /usr/local/lib/python3.6/dist-packages (from newspaper3k) (0.3)\n", | |
"Requirement already satisfied: feedparser>=5.2.1 in /usr/local/lib/python3.6/dist-packages (from newspaper3k) (5.2.1)\n", | |
"Requirement already satisfied: beautifulsoup4>=4.4.1 in /usr/local/lib/python3.6/dist-packages (from newspaper3k) (4.6.3)\n", | |
"Requirement already satisfied: nltk>=3.2.1 in /usr/local/lib/python3.6/dist-packages (from newspaper3k) (3.2.5)\n", | |
"Requirement already satisfied: tldextract>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from newspaper3k) (2.2.2)\n", | |
"Requirement already satisfied: cssselect>=0.9.2 in /usr/local/lib/python3.6/dist-packages (from newspaper3k) (1.1.0)\n", | |
"Requirement already satisfied: python-dateutil>=2.5.3 in /usr/local/lib/python3.6/dist-packages (from newspaper3k) (2.8.1)\n", | |
"Requirement already satisfied: Pillow>=3.3.0 in /usr/local/lib/python3.6/dist-packages (from newspaper3k) (7.0.0)\n", | |
"Requirement already satisfied: requests>=2.10.0 in /usr/local/lib/python3.6/dist-packages (from newspaper3k) (2.23.0)\n", | |
"Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from feedfinder2>=0.0.4->newspaper3k) (1.15.0)\n", | |
"Requirement already satisfied: idna in /usr/local/lib/python3.6/dist-packages (from tldextract>=2.0.1->newspaper3k) (2.10)\n", | |
"Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from tldextract>=2.0.1->newspaper3k) (49.1.0)\n", | |
"Requirement already satisfied: requests-file>=1.4 in /usr/local/lib/python3.6/dist-packages (from tldextract>=2.0.1->newspaper3k) (1.5.1)\n", | |
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.10.0->newspaper3k) (2020.6.20)\n", | |
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.10.0->newspaper3k) (1.24.3)\n", | |
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.10.0->newspaper3k) (3.0.4)\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "JPcX8X86-gbp", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"from newspaper import fulltext\n", | |
"import requests" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "q2_biwnn-oT_", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "432941e9-47dc-4ecc-b47d-2a6d55981e14" | |
}, | |
"source": [ | |
"len(all_arts)" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"751" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 19 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "1L0is75MnBr_", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"texts = {}" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "AxA28szPEzNQ", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"idx=0\n", | |
"for url in all_arts.url:\n", | |
" idx += 1\n", | |
" \n", | |
" print(\"{}: {}/{}\".format(url, idx, len(all_arts.url)))\n", | |
" \n", | |
" if url in texts:\n", | |
" continue\n", | |
" try:\n", | |
" texts[url] = fulltext(requests.get(url).text)\n", | |
" except KeyboardInterrupt:\n", | |
" raise\n", | |
" except:\n", | |
" texts[url] = \"\"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "7qYa1FV_oRwi", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"art_df = pd.DataFrame([texts]).T" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Osh0x1u70Sri", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"art_df.to_csv(\"20170727_articles.csv\")" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "t8_L1OlD0SuG", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 17 | |
}, | |
"outputId": "472aba8a-bbdc-4111-9703-613bef7296f9" | |
}, | |
"source": [ | |
"from google.colab import files\n", | |
"files.download('20170727_articles.csv') \n" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/javascript": [ | |
"\n", | |
" async function download(id, filename, size) {\n", | |
" if (!google.colab.kernel.accessAllowed) {\n", | |
" return;\n", | |
" }\n", | |
" const div = document.createElement('div');\n", | |
" const label = document.createElement('label');\n", | |
" label.textContent = `Downloading \"${filename}\": `;\n", | |
" div.appendChild(label);\n", | |
" const progress = document.createElement('progress');\n", | |
" progress.max = size;\n", | |
" div.appendChild(progress);\n", | |
" document.body.appendChild(div);\n", | |
"\n", | |
" const buffers = [];\n", | |
" let downloaded = 0;\n", | |
"\n", | |
" const channel = await google.colab.kernel.comms.open(id);\n", | |
" // Send a message to notify the kernel that we're ready.\n", | |
" channel.send({})\n", | |
"\n", | |
" for await (const message of channel.messages) {\n", | |
" // Send a message to notify the kernel that we're ready.\n", | |
" channel.send({})\n", | |
" if (message.buffers) {\n", | |
" for (const buffer of message.buffers) {\n", | |
" buffers.push(buffer);\n", | |
" downloaded += buffer.byteLength;\n", | |
" progress.value = downloaded;\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" const blob = new Blob(buffers, {type: 'application/binary'});\n", | |
" const a = document.createElement('a');\n", | |
" a.href = window.URL.createObjectURL(blob);\n", | |
" a.download = filename;\n", | |
" div.appendChild(a);\n", | |
" a.click();\n", | |
" div.remove();\n", | |
" }\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.Javascript object>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/javascript": [ | |
"download(\"download_18f5b57d-e933-49cb-a8ee-03cdbbc7ecce\", \"20170727_articles.csv\", 3338280)" | |
], | |
"text/plain": [ | |
"<IPython.core.display.Javascript object>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "wXo6eai6COLI", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"art_df.columns = [\"text\"]" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "xIY6RMIG6f4Q", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"art_df[\"length\"] = art_df.text.apply(lambda x: len(x))" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "vaIdFpXZCUhT", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"art_df = art_df[art_df.length > 0]" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "AVVa54vbCZRc", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"OK, we're down to 700 articles. We're not handling long articles in this notebook, so we'll keep only those longer than 100 and shorter than 1500 characters." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "TC0L4_laCXIp", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"art_df = art_df[(art_df.length > 100) & (art_df.length < 1500)]" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "S1RCL4NvCmME", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"OK, now we're down to a reasonable 168 articles." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "saUbOhCZCXwy", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 589 | |
}, | |
"outputId": "4a0621a9-8076-4c12-9e4a-3c3fd8979f30" | |
}, | |
"source": [ | |
"art_df" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>text</th>\n", | |
" <th>length</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>https://cnn.com/2020/07/25/us/huge-black-bear-in-pool-trnd/index.html</th>\n", | |
" <td>(CNN) A woman in Virginia was delighted when a...</td>\n", | |
" <td>593</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>https://cnn.com/2020/07/26/weather/hurricane-douglas-forecast-hawaii-sunday/index.html</th>\n", | |
" <td>(CNN) Douglas could become only the third hurr...</td>\n", | |
" <td>820</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>https://cnn.com/videos/politics/2020/07/26/john-lewis-procession-edmund-pettus-bridge-vpx-rs.cnn</th>\n", | |
" <td>The body of Rep. John Lewis crosses the Edmund...</td>\n", | |
" <td>163</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>https://cnn.com/videos/health/2020/07/25/texas-doctor-donnelly-intv-coronavirus-devastating-parking-lot-patient-sot-ac360-vpx.cnn</th>\n", | |
" <td>Texas ER physician Dr. Joseph Donnelly describ...</td>\n", | |
" <td>231</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>https://nytimes.com/interactive/2020/07/24/briefing/federal-agents-john-lewis-mars-news-quiz.html</th>\n", | |
" <td>1 of 11\\n\\nFederal agents are patrolling Portl...</td>\n", | |
" <td>149</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>https://cnn.com/2020/07/23/health/covid-mask-layers-wellness/index.html</th>\n", | |
" <td>(CNN) Home-made cloth face masks likely need a...</td>\n", | |
" <td>1116</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>https://cnn.com/2020/07/26/politics/john-lewis-voting-rights-bill-rename/index.html</th>\n", | |
" <td>Washington (CNN) House Majority Whip Jim Clybu...</td>\n", | |
" <td>1092</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>https://www.foxnews.com/us/chicago-nurse-brawls-with-train-passenger-who-ranted-about-coronavirus-video</th>\n", | |
" <td>Video captured on the Chicago Transit Authorit...</td>\n", | |
" <td>1116</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>https://itunes.apple.com/us/podcast/id1444873564?mt=2&at=1001lvyS&ct=pr_wp_showpage</th>\n", | |
" <td>RECurley ,\\n\\nGreat show, but I wish the media...</td>\n", | |
" <td>1054</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>https://cnn.com/videos/us/2020/07/08/all-lives-matter-offensive-problematic-eg-orig.cnn</th>\n", | |
" <td>Whether it's intentional or not, saying those ...</td>\n", | |
" <td>205</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>168 rows × 2 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" text length\n", | |
"https://cnn.com/2020/07/25/us/huge-black-bear-i... (CNN) A woman in Virginia was delighted when a... 593\n", | |
"https://cnn.com/2020/07/26/weather/hurricane-do... (CNN) Douglas could become only the third hurr... 820\n", | |
"https://cnn.com/videos/politics/2020/07/26/john... The body of Rep. John Lewis crosses the Edmund... 163\n", | |
"https://cnn.com/videos/health/2020/07/25/texas-... Texas ER physician Dr. Joseph Donnelly describ... 231\n", | |
"https://nytimes.com/interactive/2020/07/24/brie... 1 of 11\\n\\nFederal agents are patrolling Portl... 149\n", | |
"... ... ...\n", | |
"https://cnn.com/2020/07/23/health/covid-mask-la... (CNN) Home-made cloth face masks likely need a... 1116\n", | |
"https://cnn.com/2020/07/26/politics/john-lewis-... Washington (CNN) House Majority Whip Jim Clybu... 1092\n", | |
"https://www.foxnews.com/us/chicago-nurse-brawls... Video captured on the Chicago Transit Authorit... 1116\n", | |
"https://itunes.apple.com/us/podcast/id144487356... RECurley ,\\n\\nGreat show, but I wish the media... 1054\n", | |
"https://cnn.com/videos/us/2020/07/08/all-lives-... Whether it's intentional or not, saying those ... 205\n", | |
"\n", | |
"[168 rows x 2 columns]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 57 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "qvNuIqq0C5Ga", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 581 | |
}, | |
"outputId": "7d94243e-3083-4582-b6f8-a5246e19824e" | |
}, | |
"source": [ | |
"all_arts.head()" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Unnamed: 0</th>\n", | |
" <th>text</th>\n", | |
" <th>notes</th>\n", | |
" <th>date</th>\n", | |
" <th>org</th>\n", | |
" <th>url</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>431</th>\n", | |
" <td>/2020/07/23/podcasts/nice-white-parents-serial...</td>\n", | |
" <td>Introducing: ‘Nice White Parents’A podcast on ...</td>\n", | |
" <td>NaN</td>\n", | |
" <td>20200726-0000</td>\n", | |
" <td>nytimes.com</td>\n", | |
" <td>https://nytimes.com/2020/07/23/podcasts/nice-w...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>155</th>\n", | |
" <td>/2020/07/26/politics/brett-giroir-testing-time...</td>\n", | |
" <td>Giroir says coronavirus testing turnaround tim...</td>\n", | |
" <td>NaN</td>\n", | |
" <td>20200726-1430</td>\n", | |
" <td>cnn.com</td>\n", | |
" <td>https://cnn.com/2020/07/26/politics/brett-giro...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>425</th>\n", | |
" <td>https://www.foxnews.com/sports/astros-verlande...</td>\n", | |
" <td>Astros' Verlander out at least two weeks with ...</td>\n", | |
" <td>NaN</td>\n", | |
" <td>20200726-2300</td>\n", | |
" <td>foxnews.com</td>\n", | |
" <td>https://www.foxnews.com/sports/astros-verlande...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>224</th>\n", | |
" <td>https://www.foxnews.com/politics/ex-spy-chris-...</td>\n", | |
" <td>Ex-spy Christopher Steele surfaces after FISA ...</td>\n", | |
" <td>NaN</td>\n", | |
" <td>20200726-0000</td>\n", | |
" <td>foxnews.com</td>\n", | |
" <td>https://www.foxnews.com/politics/ex-spy-chris-...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>77</th>\n", | |
" <td>/2020/07/25/sport/alexi-pappas-olympics-marath...</td>\n", | |
" <td>'It's OK to not feel great': Mom's suicide lef...</td>\n", | |
" <td>NaN</td>\n", | |
" <td>20200726-0000</td>\n", | |
" <td>cnn.com</td>\n", | |
" <td>https://cnn.com/2020/07/25/sport/alexi-pappas-...</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Unnamed: 0 ... url\n", | |
"431 /2020/07/23/podcasts/nice-white-parents-serial... ... https://nytimes.com/2020/07/23/podcasts/nice-w...\n", | |
"155 /2020/07/26/politics/brett-giroir-testing-time... ... https://cnn.com/2020/07/26/politics/brett-giro...\n", | |
"425 https://www.foxnews.com/sports/astros-verlande... ... https://www.foxnews.com/sports/astros-verlande...\n", | |
"224 https://www.foxnews.com/politics/ex-spy-chris-... ... https://www.foxnews.com/politics/ex-spy-chris-...\n", | |
"77 /2020/07/25/sport/alexi-pappas-olympics-marath... ... https://cnn.com/2020/07/25/sport/alexi-pappas-...\n", | |
"\n", | |
"[5 rows x 6 columns]" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 58 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "-4Vgo7zdChWg", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"df = pd.concat([art_df, all_arts.set_index(\"url\")[[\"org\"]]], axis=1, join='inner').reset_index().rename(columns={\"index\":\"url\"})" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "eNkgXt9HD7Ha", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 102 | |
}, | |
"outputId": "d3a06881-6084-43fe-a337-f5623500e54c" | |
}, | |
"source": [ | |
"df.org.value_counts()" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"cnn.com 83\n", | |
"washingtonpost.com 34\n", | |
"foxnews.com 26\n", | |
"nytimes.com 25\n", | |
"Name: org, dtype: int64" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 76 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "nHIZJpucEruu", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"prompts = {}" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "1PyGaFnqForY", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab_type": "code", | |
"id": "dCU-KVkGFoyB", | |
"colab": {} | |
}, | |
"source": [ | |
"#arguments to send the API\n", | |
"kwargs = {\n", | |
"\"engine\":\"davinci\",\n", | |
"\"temperature\":0,\n", | |
"\"max_tokens\":300,\n", | |
"\"stop\":\"\\n\",\n", | |
"}" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
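{ | |
"cell_type": "markdown", | |
"metadata": { | |
"colab_type": "text", | |
"id": "OrnglBl0FoyF" | |
}, | |
"source": [ | |
"Quick wrapper around the completion API. It only sends the prompt and returns the stripped response text; the calling code stores the prompts and responses in `prompts` so they can be saved for later analysis if needed." | |
] | |
}, | |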
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab_type": "code", | |
"id": "EN8x7mDUFoyF", | |
"colab": {} | |
}, | |
"source": [ | |
"def query(prompt, myKwargs=None):\n", | |
"    \"\"\"\n", | |
"    Send one prompt to the completion API and return the generated text.\n", | |
"\n", | |
"    prompt: the complete prompt string to send.\n", | |
"    myKwargs: keyword arguments forwarded to openai.Completion.create.\n", | |
"        When omitted (None), the notebook-level `kwargs` dict is used.\n", | |
"\n", | |
"    Returns the \"text\" of the first choice with surrounding whitespace stripped.\n", | |
"    \"\"\"\n", | |
"    # Resolve the default at call time, not at definition time, so that re-running\n", | |
"    # the `kwargs` cell takes effect without having to re-run this cell as well.\n", | |
"    if myKwargs is None:\n", | |
"        myKwargs = kwargs\n", | |
"    response = openai.Completion.create(prompt=prompt, **myKwargs)\n", | |
"    return response[\"choices\"][0][\"text\"].strip()" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "LMMaDiZ4Fb66", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"prompts[\"\\nThis boils down to\"] = {\"prompt\":\"\"\"{}\\nThis boils down to the simple idea that\"\"\", \"results\":{}}" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "jyMsn_ieE1eg", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"prompts[\"\\n\\nThis boils down to\"] = {\"prompt\":\"\"\"{}\\n\\nThis boils down to the simple idea that\"\"\", \"results\":{}}" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "z00blx2jFOXO", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"prompts[\"\\n\\nThis could be expressed simply as\"] = {\"prompt\":\"\"\"{}\\n\\nThis could be expressed simply as\"\"\", \"results\":{}}" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "vTt8ca6LGZhh", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"prompts[\"TLDR:\"] = {\"prompt\":\"\"\"{}\\nTLDR:\"\"\", \"results\":{}}" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "xS4pfE_vGmAe", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"prompts[\"tl;dr\"] = {\"prompt\":\"\"\"{}\\ntl;dr\"\"\", \"results\":{}}" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "SheB3nAcHMs2", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"prompts[\"tl;dr:\"] = {\"prompt\":\"\"\"{}\\ntl;dr:\"\"\", \"results\":{}}" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ztWmiXRDIoFw", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"prompts[\"paragraph\"] = {\"prompt\":\"\"\"Read the following text:\n", | |
"{}\n", | |
"\n", | |
"Summarize the text in one paragraph:\n", | |
"\"\"\", \"results\":{}}" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "HTL2j2PigJmo", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"gettysburg = \"\"\"Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal.\n", | |
"\n", | |
"Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this.\"\"\"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "jPfR-WSsIs_B", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "91912215-6401-4734-ca77-86850c5e299a" | |
}, | |
"source": [ | |
"prompts[\"sentence\"] = {\"prompt\":\"\"\"Read the following text:\n", | |
"{}\n", | |
"\n", | |
"Summarize the text in one sentence:\n", | |
"\"\"\", \"results\":{}}\n", | |
"query(testPrompt.format(gettysburg))" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "string" | |
}, | |
"text/plain": [ | |
"'The Emancipation Proclamation was a beacon of hope for the slaves.'" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 156 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "2jIWt3CDdmHL", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "41165ca0-eb48-4c78-fa14-25fc93c0fd9d" | |
}, | |
"source": [ | |
"prompts[\"sentence2\"] = {\"prompt\":'Read the following text:\\n\"\"\"\\n{}\\n\"\"\"\\n\\nSummarize the text in one sentence:', \"results\":{}}\n" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "string" | |
}, | |
"text/plain": [ | |
"'The Emancipation Proclamation was a beacon of hope for the slaves.'" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 181 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "EKYzyHC4ee-N", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"prompts[\"sentence3\"] = {\"prompt\":'\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Summarize the preceeding text in one sentence:\\nA:', \"results\":{}}\n" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "U0hMt_qJh7oV", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"prompts[\"paragraph2\"] = {\"prompt\":'\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Summarize the preceeding text in one paragraph:\\nA:', \"results\":{}}\n" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "cF4BgKiSiAux", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"prompts[\"two_sentences\"] = {\"prompt\":'\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Summarize the preceeding text in two sentences:\\nA:', \"results\":{}}\n" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "raLYbqlQh-fW", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"prompts[\"two_paragraph\"] = {\"prompt\":'\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Summarize the preceeding text in two paragraphs:\\nA:', \"results\":{}}\n" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "REDAgaC1iFto", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"prompts[\"few_sentences\"] = {\"prompt\":'\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Summarize the preceeding text in a few sentences:\\nA:', \"results\":{}}\n" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ek-7Bc22iEH1", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"prompts[\"few_paragraph\"] = {\"prompt\":'\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Summarize the preceeding text in a few paragraphs:\\nA:', \"results\":{}}\n" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ORe75RztiLMY", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"prompts[\"long\"] = {\"prompt\":'\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Give a long summarization of the preceeding text:\\nA:', \"results\":{}}\n", | |
"prompts[\"short\"] = {\"prompt\":'\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Give a short summarization of the preceeding text:\\nA:', \"results\":{}}\n", | |
"prompts[\"medium\"] = {\"prompt\":'\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Give a medium length summarization of the preceeding text:\\nA:', \"results\":{}}\n" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "4qnVlzZAfv7r", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"gettysburg = \"\"\"Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal.\n", | |
"\n", | |
"Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this.\"\"\"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "QWvl_s92fpp1", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 173 | |
}, | |
"outputId": "d3b8751d-3b13-495e-92ab-3d785d70321d" | |
}, | |
"source": [ | |
"print(prompts[\"sentence3\"][\"prompt\"].format(gettysburg))" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"\"\"\"\n", | |
"Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal.\n", | |
"\n", | |
"Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this.\n", | |
"\"\"\"\n", | |
"\n", | |
"Q: Summarize the preceeding text in one sentence:\n", | |
"A:\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "6Ja8uw-Qflgk", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 52 | |
}, | |
"outputId": "35c5904d-3320-4d96-9d3a-0d510d50cd8e" | |
}, | |
"source": [ | |
"query(prompts[\"sentence3\"][\"prompt\"].format(gettysburg))" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "string" | |
}, | |
"text/plain": [ | |
"'The Civil War was fought to determine whether the United States would be a nation dedicated to the proposition that all men are created equal.'" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 201 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "8GllqXh8dSKF", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 102 | |
}, | |
"outputId": "fdef0183-60d4-4383-b0eb-27daec1b1c90" | |
}, | |
"source": [ | |
"print(prompts[\"sentence\"][\"prompt\"])" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Read the following text:\n", | |
"{}\n", | |
"\n", | |
"Summarize the text in one sentence:\n", | |
"\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "BiOlp4ddNpM6", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"import json, time" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "l06VFXPeNp2d", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"with open(\"rslts.json\", \"w\") as fh:\n", | |
" json.dump(prompts, fh, indent=4)\n", | |
"files.download('rslts.json') \n", | |
"\n" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "TO9imk7xFemH", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"idx = 0  # stays 0 when there is nothing to iterate over\n", | |
"for prompt in prompts:\n", | |
"    print(prompt)\n", | |
"    answered = prompts[prompt][\"results\"]\n", | |
"    # The progress counter restarts at 1 for every prompt.\n", | |
"    for idx, text in enumerate(df.text, start=1):\n", | |
"        print(\"{}\".format(idx))\n", | |
"        # Only call the API for texts this prompt has no stored result for yet.\n", | |
"        if text not in answered:\n", | |
"            answered[text] = query(prompts[prompt][\"prompt\"].format(text))" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "1i8LYs3HOVhN", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 17 | |
}, | |
"outputId": "c4ed5075-d9eb-49be-e8ea-8103b1b0c020" | |
}, | |
"source": [ | |
"with open(\"rslts.json\", \"w\") as fh:\n", | |
" json.dump(prompts, fh, indent=4)\n", | |
"files.download('rslts.json') \n" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/javascript": [ | |
"\n", | |
" async function download(id, filename, size) {\n", | |
" if (!google.colab.kernel.accessAllowed) {\n", | |
" return;\n", | |
" }\n", | |
" const div = document.createElement('div');\n", | |
" const label = document.createElement('label');\n", | |
" label.textContent = `Downloading \"${filename}\": `;\n", | |
" div.appendChild(label);\n", | |
" const progress = document.createElement('progress');\n", | |
" progress.max = size;\n", | |
" div.appendChild(progress);\n", | |
" document.body.appendChild(div);\n", | |
"\n", | |
" const buffers = [];\n", | |
" let downloaded = 0;\n", | |
"\n", | |
" const channel = await google.colab.kernel.comms.open(id);\n", | |
" // Send a message to notify the kernel that we're ready.\n", | |
" channel.send({})\n", | |
"\n", | |
" for await (const message of channel.messages) {\n", | |
" // Send a message to notify the kernel that we're ready.\n", | |
" channel.send({})\n", | |
" if (message.buffers) {\n", | |
" for (const buffer of message.buffers) {\n", | |
" buffers.push(buffer);\n", | |
" downloaded += buffer.byteLength;\n", | |
" progress.value = downloaded;\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" const blob = new Blob(buffers, {type: 'application/binary'});\n", | |
" const a = document.createElement('a');\n", | |
" a.href = window.URL.createObjectURL(blob);\n", | |
" a.download = filename;\n", | |
" div.appendChild(a);\n", | |
" a.click();\n", | |
" div.remove();\n", | |
" }\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.Javascript object>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/javascript": [ | |
"download(\"download_ece8e904-c95b-43f7-bfbc-03147b5c5ec5\", \"rslts.json\", 2889033)" | |
], | |
"text/plain": [ | |
"<IPython.core.display.Javascript object>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "v8QxMY9Sbhxt", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# Build one frame per prompt key. reset_index() turns each frame's index into a regular\n", | |
"# column, and the \"results\" column is renamed to the prompt key.\n", | |
"dfs = [\n", | |
"    pd.DataFrame(prompts[prompt]).reset_index().rename(columns={\"results\": prompt})\n", | |
"    for prompt in prompts\n", | |
"]" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "irbMKCt2coXj", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# Do not truncate columns or rows when displaying frames.\n", | |
"pd.set_option(\"display.max_columns\", None)\n", | |
"# Use the full option key: the bare pattern \"max_rows\" is ambiguous on newer pandas\n", | |
"# versions, where it matches more than one option.\n", | |
"pd.set_option(\"display.max_rows\", None)\n" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "yeYDTXUResoz", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"# Warning: I think the commented-out line in the next cell is wrong. Use the uncommented `join` that follows it instead." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "U0P9LBR9M70z", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# all_results = pd.concat([df] + ([x[[x.columns[-1]]].fillna(\"-1\") for x in dfs]), axis=1).dropna()\n", | |
"all_results = df[[\"url\", \"text\"]].join(pd.concat(dfs, axis=1), on=\"text\")" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "7V4dE-LyNMID", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 17 | |
}, | |
"outputId": "075de8e3-53d4-4586-fb98-9526b3aeb082" | |
}, | |
"source": [ | |
"all_results.to_json(\"all_summary_prompts_results.json\", orient=\"records\")\n", | |
"files.download('all_summary_prompts_results.json') " | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/javascript": [ | |
"\n", | |
" async function download(id, filename, size) {\n", | |
" if (!google.colab.kernel.accessAllowed) {\n", | |
" return;\n", | |
" }\n", | |
" const div = document.createElement('div');\n", | |
" const label = document.createElement('label');\n", | |
" label.textContent = `Downloading \"${filename}\": `;\n", | |
" div.appendChild(label);\n", | |
" const progress = document.createElement('progress');\n", | |
" progress.max = size;\n", | |
" div.appendChild(progress);\n", | |
" document.body.appendChild(div);\n", | |
"\n", | |
" const buffers = [];\n", | |
" let downloaded = 0;\n", | |
"\n", | |
" const channel = await google.colab.kernel.comms.open(id);\n", | |
" // Send a message to notify the kernel that we're ready.\n", | |
" channel.send({})\n", | |
"\n", | |
" for await (const message of channel.messages) {\n", | |
" // Send a message to notify the kernel that we're ready.\n", | |
" channel.send({})\n", | |
" if (message.buffers) {\n", | |
" for (const buffer of message.buffers) {\n", | |
" buffers.push(buffer);\n", | |
" downloaded += buffer.byteLength;\n", | |
" progress.value = downloaded;\n", | |
" }\n", | |
" }\n", | |
" }\n", | |
" const blob = new Blob(buffers, {type: 'application/binary'});\n", | |
" const a = document.createElement('a');\n", | |
" a.href = window.URL.createObjectURL(blob);\n", | |
" a.download = filename;\n", | |
" div.appendChild(a);\n", | |
" a.click();\n", | |
" div.remove();\n", | |
" }\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.Javascript object>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
}, | |
{ | |
"output_type": "display_data", | |
"data": { | |
"application/javascript": [ | |
"download(\"download_619a5e7a-5a50-478c-8671-d89e54c5f864\", \"all_summary_prompts_results.json\", 658458)" | |
], | |
"text/plain": [ | |
"<IPython.core.display.Javascript object>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
} | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "wV-QYdewzo35", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"cols = [\"text\"]+list(all_results.columns[4:])" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "WMMWKKv7M-qa", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# Per-column mean of len() over every entry of the selected columns.\n", | |
"lengths = all_results[cols].applymap(len)\n", | |
"base_df = pd.DataFrame(lengths.mean(), columns=[\"mean_length\"])" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "cE5Fdpb3NDHU", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"for column in all_results.columns[4:]:\n", | |
" base_df.at[column, \"prompt\"] = prompts[column][\"prompt\"]" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "-t7UnmBR4ThJ", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"for column in all_results.columns[4:]:\n", | |
"    summaries = all_results[column]\n", | |
"    non_empty = summaries.apply(lambda s: len(s) > 0)\n", | |
"    # How many entries in this column have a non-zero length.\n", | |
"    base_df.at[column, \">0 len results\"] = summaries[non_empty].count()" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "FKKtBpvv4uV6", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"for column in all_results.columns[4:]:\n", | |
"    lengths = all_results[column].apply(len)\n", | |
"    # Mean length over the non-empty entries only (NaN when there are none).\n", | |
"    base_df.at[column, \"(>0 len) mean\"] = lengths[lengths > 0].mean()" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "-uGrodCw4s2E", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "aa68ee36-44c7-4862-887c-d5ee17588b4a" | |
}, | |
"source": [ | |
"kwargs" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"{'engine': 'davinci', 'max_tokens': 300, 'stop': '\\n', 'temperature': 0}" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 274 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "I_waxlto5K5u", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"# None means no limit on the column width; passing a negative integer is deprecated\n", | |
"# since pandas 1.0.\n", | |
"pd.set_option(\"display.max_colwidth\", None)\n" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "eH0xCISZNKI0", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"result_table = base_df[[\"prompt\"] + [x for x in base_df.columns if x != \"prompt\"]].set_index(\"prompt\", drop=True)" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "SpdL-G0WhXbw", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 377 | |
}, | |
"outputId": "57ec4011-de7d-4db6-f61a-7cb1557b942f" | |
}, | |
"source": [ | |
"print(\"|| prompt || {} ||\".format(\"||\".join(result_table.columns)))\n", | |
"for row in result_table.iterrows():\n", | |
" print(\"|| {} || {} ||\".format(repr(row[0]), \"||\".join([str(np.round(x)) for x in row[1]])))" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"|| prompt || mean_length||>0 len results||(>0 len) mean ||\n", | |
"|| nan || 782.0||nan||nan ||\n", | |
"|| '{}\\nThis boils down to the simple idea that' || 350.0||152.0||350.0 ||\n", | |
"|| '{}\\n\\nThis boils down to the simple idea that' || 308.0||152.0||308.0 ||\n", | |
"|| '{}\\n\\nThis could be expressed simply as' || 251.0||152.0||251.0 ||\n", | |
"|| '{}\\nTLDR:' || 148.0||120.0||188.0 ||\n", | |
"|| '{}\\ntl;dr' || 142.0||148.0||145.0 ||\n", | |
"|| '{}\\ntl;dr:' || 122.0||124.0||149.0 ||\n", | |
"|| 'Read the following text:\\n{}\\n\\nSummarize the text in one paragraph:\\n' || 15.0||6.0||388.0 ||\n", | |
"|| 'Read the following text:\\n{}\\n\\nSummarize the text in one sentence:\\n' || 5.0||8.0||94.0 ||\n", | |
"|| 'Read the following text:\\n\"\"\"\\n{}\\n\"\"\"\\n\\nSummarize the text in one sentence:' || 0.0||0.0||nan ||\n", | |
"|| '\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Summarize the preceeding text in one sentence:\\nA:' || 104.0||152.0||104.0 ||\n", | |
"|| '\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Summarize the preceeding text in one paragraph:\\nA:' || 250.0||149.0||255.0 ||\n", | |
"|| '\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Summarize the preceeding text in two sentences:\\nA:' || 104.0||150.0||106.0 ||\n", | |
"|| '\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Summarize the preceeding text in two paragraphs:\\nA:' || 253.0||140.0||275.0 ||\n", | |
"|| '\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Summarize the preceeding text in a few sentences:\\nA:' || 202.0||149.0||206.0 ||\n", | |
"|| '\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Summarize the preceeding text in a few paragraphs:\\nA:' || 312.0||150.0||316.0 ||\n", | |
"|| '\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Give a long summarization of the preceeding text:\\nA:' || 135.0||150.0||137.0 ||\n", | |
"|| '\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Give a short summarization of the preceeding text:\\nA:' || 132.0||149.0||135.0 ||\n", | |
"|| '\"\"\"\\n{}\\n\"\"\"\\n\\nQ: Give a medium length summarization of the preceeding text:\\nA:' || 204.0||147.0||211.0 ||\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "yQtaLL15_ojI", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "4518df61-4963-4fd1-ed6f-c6d04f02688d" | |
}, | |
"source": [ | |
"len(result_table)" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"19" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 295 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "9kBFLSWV_qam", | |
"colab_type": "code", | |
"colab": {} | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment