Skip to content

Instantly share code, notes, and snippets.

@Wesitos
Created May 7, 2017 01:00
Show Gist options
  • Save Wesitos/840dccc42ccc4eb372f0325560b9b2f3 to your computer and use it in GitHub Desktop.
Save Wesitos/840dccc42ccc4eb372f0325560b9b2f3 to your computer and use it in GitHub Desktop.
Ejemplo de scraping usando aiohttp y BeautifulSoup
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import aiohttp as req\n",
"import asyncio as aio\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# With Python 3.5+ syntax this would be declared as: async def get_page(url)\n",
"@aio.coroutine\n",
"def get_page(url):\n",
"    \"\"\"Request ``url`` through ``req.get`` and return the result of the response's ``text()``.\"\"\"\n",
"    # With Python 3.5+ syntax: await req.get(url)\n",
"    response = yield from req.get(url)\n",
"    body = yield from response.text()\n",
"    return body\n",
"\n",
"@aio.coroutine\n",
"def main():\n",
"    \"\"\"Fetch the El Comercio technology page with ``get_page`` and return its text.\"\"\"\n",
"    page_url = \"http://elcomercio.pe/tecnologia?ref=portada_home&ft=menu_nav\"\n",
"    return (yield from get_page(page_url))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Así se ejecuta una corrutina"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<!doctype html>\n",
"<html lang=\"es\">\n",
" <head>\n",
" <meta meta charset=\"utf-8\">\n",
" <meta meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge\">\n",
"\t <title>Noticias de Tecnología | Galaxy S8 | Samsung | Apple | Note 7 | Nintendo Switch | iPhone 7 | MWC 2017 | Apps | Android | iOS | Pokemon Go | Cyber Monday | Black Friday | Macbook | Google | Pixel | Nintendo | YouTube | Facebook | WhatsApp | PS4 | Xbox | Doodle | Samsung Galaxy | Instagram | Linkedin | WiFi | Pinterest | Snapchat | Mark Zuckerberg | Redes sociales| Netflix | Dota 2 | El Comercio Peru</title>\n",
" <link rel=\"dns-prefetch\" href=\"https://cdn.elcomercio.e3.pe/\">\n",
" <link rel=\"dns-prefetch\" href=\"http://cde.3.elcomercio.pe/\">\n",
" <link rel=\"dns-prefetch\" href=\"http://code2.adtlgc.com/\"/>\n",
" <meta name=\"keywords\" content=\"Tecnología, Facebook, YouTube, Google, Doodle, Apple, Instagram, Linkedln, WiFi, Pinterest, Snapchat, WhatsApp, Drones, Redes, sociales, Netf\n"
]
}
],
"source": [
"# Run the main() coroutine to completion on asyncio's event loop.\n",
"loop = aio.get_event_loop()\n",
"text = loop.run_until_complete(main())\n",
"# Show only the first 1000 characters of the downloaded page.\n",
"print(text[:1000])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Podemos hacer una función"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def call_coroutine(coro):\n",
"    \"\"\"Run a coroutine on asyncio's event loop and return its result.\n",
"\n",
"    ``coro`` may be a coroutine function taking no arguments (it is called\n",
"    first, as before) or an already-created coroutine/future, which is\n",
"    passed to the loop unchanged.\n",
"    \"\"\"\n",
"    loop = aio.get_event_loop()\n",
"    # Coroutine functions are callable; coroutine objects and futures are not.\n",
"    awaitable = coro() if callable(coro) else coro\n",
"    return loop.run_until_complete(awaitable)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Analizamos la respuesta"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Parse the downloaded HTML text with BeautifulSoup's 'lxml' parser.\n",
"soup = BeautifulSoup(text, 'lxml')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Collect every element whose CSS class is 'ec-ultimas'.\n",
"sections = soup.find_all(class_='ec-ultimas')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Flatten the <article> tags of every section into a single list.\n",
"articles = [\n",
"    article\n",
"    for section in sections\n",
"    for article in section.find_all('article')\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def parse_article_abstract(soup):\n",
"    \"\"\"Extract the title, abstract and image URL from one article element.\n",
"\n",
"    ``soup`` is the parsed element for a single article (it must provide\n",
"    ``select_one`` and ``find``). Returns a dict with the keys 'title',\n",
"    'abstract' and 'img'; a value is None when the corresponding element\n",
"    (or the image's ``src`` attribute) is missing, instead of raising\n",
"    AttributeError.\n",
"    \"\"\"\n",
"    title_link = soup.select_one('header h2 a')\n",
"    paragraph = soup.find('p')\n",
"    image = soup.select_one('figure img')\n",
"    return {\n",
"        'title': title_link.text if title_link is not None else None,\n",
"        'abstract': paragraph.text if paragraph is not None else None,\n",
"        'img': image.attrs.get('src') if image is not None else None\n",
"    }"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# One <article> on the page does not follow the expected structure\n",
"# (it has no 'header h2'), so it is filtered out before parsing.\n",
"data = [\n",
"    parse_article_abstract(article)\n",
"    for article in articles\n",
"    if article.select_one('header h2') is not None\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Automatizamos"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"@aio.coroutine\n",
"def main():\n",
"    \"\"\"Download the technology page and return its parsed article summaries.\n",
"\n",
"    Returns a list with one ``parse_article_abstract`` dict per\n",
"    '.ec-ultimas article' element that contains a 'header h2'.\n",
"    \"\"\"\n",
"    text = yield from get_page(\"http://elcomercio.pe/tecnologia?ref=portada_home&ft=menu_nav\")\n",
"    # Parse the page that was just fetched; relying on the notebook-level\n",
"    # `soup` would silently reuse the HTML downloaded by an earlier cell.\n",
"    page = BeautifulSoup(text, 'lxml')\n",
"    return [\n",
"        parse_article_abstract(article)\n",
"        for article in page.select('.ec-ultimas article')\n",
"        if article.select_one('header h2') is not None\n",
"    ]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'abstract': 'El youtuber Rudy Mancuso escenificó una pelea callejera donde los sonidos de los golpes formaron un ritmo musical peculiar',\n",
" 'img': 'http://cde.3.elcomercio.pe/ima/0/1/6/8/1/1681826/160x100.jpg',\n",
" 'title': \"Una 'pelea musical' es el viral de la semana en YouTube\"}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Run the automated main() coroutine and show the first parsed article.\n",
"data = call_coroutine(main)\n",
"data[0]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.4",
"language": "python",
"name": "python3.4"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment