Created
July 24, 2021 14:03
-
-
Save pdesai878/d30bfe8982a7641e28fdcb6f4319ae9a to your computer and use it in GitHub Desktop.
Web Scraping using requests, beautifulsoup library in python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Web Scraping using requests, beautifulsoup library in python", | |
"provenance": [], | |
"collapsed_sections": [], | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/pdesai878/d30bfe8982a7641e28fdcb6f4319ae9a/web-scraping-using-requests-beautifulsoup-library-in-python.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "5RVpCMSEeyUZ" | |
}, | |
"source": [ | |
"# Getting Data from an API" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "gqu4CcO0VFpn" | |
}, | |
"source": [ | |
"import requests\n", | |
"import json" | |
], | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "GNkZpwTHVHD8" | |
}, | |
"source": [ | |
"url = \"https://www.boredapi.com/api/activity\"" | |
], | |
"execution_count": 3, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "FRIy2CwgVMeC" | |
}, | |
"source": [ | |
"# Bound the wait: requests has no default timeout, so a stalled server would hang the notebook.\n", | |
"res = requests.get(url, timeout=10)" | |
], | |
"execution_count": 4, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "EpPj7XXYVOea", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "39455c97-56c2-452b-a0a7-0f551d159d08" | |
}, | |
"source": [ | |
"res.status_code" | |
], | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"200" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 5 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "DMoCqpcgVQnM", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "e9236b88-66d3-45a7-b398-842d9aa0c1fa" | |
}, | |
"source": [ | |
"print(res.text)" | |
], | |
"execution_count": 6, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"{\"activity\":\"Go to the gym\",\"type\":\"recreational\",\"participants\":1,\"price\":0.2,\"link\":\"\",\"key\":\"4387026\",\"accessibility\":0.1}\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "00axvBl2VTGI" | |
}, | |
"source": [ | |
"# Parse the JSON body with requests' built-in decoder instead of round-tripping through res.text.\n", | |
"result = res.json()" | |
], | |
"execution_count": 7, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "YopeUfjaVY2A", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "2a01500e-0717-4af9-e6e5-dd7c752550a9" | |
}, | |
"source": [ | |
"print(result)" | |
], | |
"execution_count": 8, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"{'activity': 'Go to the gym', 'type': 'recreational', 'participants': 1, 'price': 0.2, 'link': '', 'key': '4387026', 'accessibility': 0.1}\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "G8DlnBORVbPE", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "c01964ba-c491-4886-c6a6-5f44737c359f" | |
}, | |
"source": [ | |
"result['activity']" | |
], | |
"execution_count": 9, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "string" | |
}, | |
"text/plain": [ | |
"'Go to the gym'" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 9 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "JrlTNhkcVnJa" | |
}, | |
"source": [ | |
"# Get University Data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "Odu091_kVm4B" | |
}, | |
"source": [ | |
"url = \"http://universities.hipolabs.com/search?country=India\"" | |
], | |
"execution_count": 10, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "llyTIPOGVdyz" | |
}, | |
"source": [ | |
"# Bound the wait: requests has no default timeout, so a stalled server would hang the notebook.\n", | |
"res = requests.get(url, timeout=10)" | |
], | |
"execution_count": 11, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "UlCW-gtAV-me", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "1e654848-4be5-4702-9652-6ac1e3472b21" | |
}, | |
"source": [ | |
"res.status_code" | |
], | |
"execution_count": 12, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"200" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 12 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "giM0JeWPV_lC" | |
}, | |
"source": [ | |
"# Parse the JSON body with requests' built-in decoder instead of round-tripping through res.text.\n", | |
"universities = res.json()" | |
], | |
"execution_count": 16, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "0PW_L7x0WBp8", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "d8ab7147-6a6b-48c7-ab2d-75026f030bf7" | |
}, | |
"source": [ | |
"len(universities)" | |
], | |
"execution_count": 14, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"800" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 14 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "kshs1Ub-WEDg", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "5229cc93-eec5-4672-8760-290d26fd3de6" | |
}, | |
"source": [ | |
"universities[100]" | |
], | |
"execution_count": 17, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"{'alpha_two_code': 'IN',\n", | |
" 'country': 'India',\n", | |
" 'domains': ['gurukuluniversity.org'],\n", | |
" 'name': 'Gurukul University',\n", | |
" 'state-province': None,\n", | |
" 'web_pages': ['http://www.gurukuluniversity.org/']}" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 17 | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "bk2D-CNQZHKK" | |
}, | |
"source": [ | |
"# Scraping Quotes Data\n", | |
"**Library used: BeautifulSoup**" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "8RfmKyEQZG03" | |
}, | |
"source": [ | |
"import bs4\n", | |
"import requests" | |
], | |
"execution_count": 18, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "F-Nv1_DEZS9I" | |
}, | |
"source": [ | |
"url = \"https://quotes.toscrape.com/page/1\"" | |
], | |
"execution_count": 19, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "k6WwnvCRZYZd" | |
}, | |
"source": [ | |
"# Bound the wait: requests has no default timeout, so a stalled server would hang the notebook.\n", | |
"res = requests.get(url, timeout=10)" | |
], | |
"execution_count": 20, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "cq6t88W-ZaDL", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "d5690b7e-5e70-47f8-f9e9-0083d3c2923e" | |
}, | |
"source": [ | |
"res.status_code" | |
], | |
"execution_count": 21, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"200" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 21 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "wwKLb7YJZbi7" | |
}, | |
"source": [ | |
"html = res.text" | |
], | |
"execution_count": 22, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "AvgmdPGoZdnR" | |
}, | |
"source": [ | |
"# Name the parser explicitly; otherwise bs4 picks whichever one is installed and emits a warning.\n", | |
"soup = bs4.BeautifulSoup(html, \"html.parser\")" | |
], | |
"execution_count": 24, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "81YTc2oKZhpQ", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "ece6462a-72d7-4f5d-fcae-8045bb01c049" | |
}, | |
"source": [ | |
"type(soup)" | |
], | |
"execution_count": 25, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"bs4.BeautifulSoup" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 25 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "RyoLX2kuZoZK", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "b7311fb5-5b74-4cca-ef1f-081823e9c3a2" | |
}, | |
"source": [ | |
"soup.title" | |
], | |
"execution_count": 26, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<title>Quotes to Scrape</title>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 26 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "u5FCibzQZwvs", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 35 | |
}, | |
"outputId": "01061066-8e86-4a2f-dbc4-bdac638eeb76" | |
}, | |
"source": [ | |
"soup.h1.text" | |
], | |
"execution_count": 27, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"application/vnd.google.colaboratory.intrinsic+json": { | |
"type": "string" | |
}, | |
"text/plain": [ | |
"'\\nQuotes to Scrape\\n'" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 27 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "vf4k8gqFZ8Qh" | |
}, | |
"source": [ | |
"box = soup.find('div', attrs={\"class\": \"quote\"} )" | |
], | |
"execution_count": 28, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "UeWVhcCbaVGL", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "4f311e77-b823-4d97-bc66-da04f035b3de" | |
}, | |
"source": [ | |
"box" | |
], | |
"execution_count": 29, | |
"outputs": [ | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<div class=\"quote\" itemscope=\"\" itemtype=\"http://schema.org/CreativeWork\">\n", | |
"<span class=\"text\" itemprop=\"text\">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>\n", | |
"<span>by <small class=\"author\" itemprop=\"author\">Albert Einstein</small>\n", | |
"<a href=\"/author/Albert-Einstein\">(about)</a>\n", | |
"</span>\n", | |
"<div class=\"tags\">\n", | |
" Tags:\n", | |
" <meta class=\"keywords\" content=\"change,deep-thoughts,thinking,world\" itemprop=\"keywords\"/>\n", | |
"<a class=\"tag\" href=\"/tag/change/page/1/\">change</a>\n", | |
"<a class=\"tag\" href=\"/tag/deep-thoughts/page/1/\">deep-thoughts</a>\n", | |
"<a class=\"tag\" href=\"/tag/thinking/page/1/\">thinking</a>\n", | |
"<a class=\"tag\" href=\"/tag/world/page/1/\">world</a>\n", | |
"</div>\n", | |
"</div>" | |
] | |
}, | |
"metadata": { | |
"tags": [] | |
}, | |
"execution_count": 29 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "EAEdE1nja6Sc" | |
}, | |
"source": [ | |
"quote = box.span.text" | |
], | |
"execution_count": 30, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "bLIsv67Na8zc" | |
}, | |
"source": [ | |
"author = box.find('small').text" | |
], | |
"execution_count": 31, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "yOW4qb3QbH2C", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "e9898aa8-03eb-4666-bff5-455f32b55997" | |
}, | |
"source": [ | |
"print(quote)\n", | |
"print(\"-by\",author)" | |
], | |
"execution_count": 32, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”\n", | |
"-by Albert Einstein\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "VnXpMZ0LbeGH" | |
}, | |
"source": [ | |
"# Scrape All the Quotes" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "lfzWqniubd14" | |
}, | |
"source": [ | |
"all_boxes = soup.find_all('div', attrs={\"class\": \"quote\"} )" | |
], | |
"execution_count": 33, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "IILG780qbjTr", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"outputId": "e21e9673-3af6-4f07-f44e-0003ad329712" | |
}, | |
"source": [ | |
"# Print every quote found on the page together with its author, separated by a rule.\n", | |
"for one_box in all_boxes:\n", | |
"    print(\"==\"*50)\n", | |
"    # Select by class rather than by position so extra <span>/<small> tags cannot shift the match.\n", | |
"    quote = one_box.find('span', attrs={\"class\": \"text\"}).text\n", | |
"    author = one_box.find('small', attrs={\"class\": \"author\"}).text\n", | |
"\n", | |
"    print(quote)\n", | |
"    print(\"- by\",author)\n" | |
], | |
"execution_count": 34, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"====================================================================================================\n", | |
"“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”\n", | |
"- by Albert Einstein\n", | |
"====================================================================================================\n", | |
"“It is our choices, Harry, that show what we truly are, far more than our abilities.”\n", | |
"- by J.K. Rowling\n", | |
"====================================================================================================\n", | |
"“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”\n", | |
"- by Albert Einstein\n", | |
"====================================================================================================\n", | |
"“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”\n", | |
"- by Jane Austen\n", | |
"====================================================================================================\n", | |
"“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”\n", | |
"- by Marilyn Monroe\n", | |
"====================================================================================================\n", | |
"“Try not to become a man of success. Rather become a man of value.”\n", | |
"- by Albert Einstein\n", | |
"====================================================================================================\n", | |
"“It is better to be hated for what you are than to be loved for what you are not.”\n", | |
"- by André Gide\n", | |
"====================================================================================================\n", | |
"“I have not failed. I've just found 10,000 ways that won't work.”\n", | |
"- by Thomas A. Edison\n", | |
"====================================================================================================\n", | |
"“A woman is like a tea bag; you never know how strong it is until it's in hot water.”\n", | |
"- by Eleanor Roosevelt\n", | |
"====================================================================================================\n", | |
"“A day without sunshine is like, you know, night.”\n", | |
"- by Steve Martin\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "dIqnU5UVb3IU" | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment