Simple search scraper: a small recursive crawler that queues pages as Celery tasks, deduplicates visited URLs in Redis, and indexes each page's text into Elasticsearch.
from celery import Celery
from elasticsearch import Elasticsearch
from lxml import html
import redis
import requests
from urllib.parse import urljoin, urlparse, urlunparse
SITE = 'https://adamogrady.id.au/'

# Redis records which URLs have already been scraped, RabbitMQ (via pyamqp)
# brokers the Celery task queue, and Elasticsearch holds the search index.
r = redis.Redis(host='localhost', port=6379, db=0)
app = Celery('tasks', broker='pyamqp://guest@localhost//')
es = Elasticsearch()
@app.task
def scrape(link):
    # Skip pages another worker has already handled.
    if r.exists(link):
        return str(link)
    page = requests.get(link)
    tree = html.fromstring(page.content)
    # Collect every outgoing link and normalise it to an absolute URL.
    links = tree.xpath('//a/@href')
    links = map(clean_url, links)
    # Index the page's visible text under its URL.
    text = tree.xpath('//body//text()')
    doc = {
        'link': link,
        'text': ','.join(text)
    }
    es.index(index="test-search", doc_type='page', body=doc)
    # Mark this URL as done, then queue any unseen links on the same site.
    r.set(link, 1)
    for single_link in links:
        if SITE in single_link and r.exists(single_link) == 0:
            scrape.delay(single_link)
    return str(link)
def clean_url(link):
    split_url = urlparse(link)
    if split_url.netloc == '':
        # Relative link: resolve it against the site root.
        split_url = urlparse(urljoin(SITE, link))
    # Strip any fragment so in-page anchors don't create duplicate URLs;
    # _replace returns a new tuple, so the result must be reassigned.
    split_url = split_url._replace(fragment='')
    return urlunparse(split_url)
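# For illustration, a few hypothetical inputs and the URLs clean_url
# returns for them, assuming SITE as defined above:
#   clean_url('/about')               -> 'https://adamogrady.id.au/about'
#   clean_url('/blog/post#comments')  -> 'https://adamogrady.id.au/blog/post'
#   clean_url('https://example.com/') -> 'https://example.com/'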
if __name__ == '__main__':
    # Seed the crawl with the site root when this file is run directly,
    # so a Celery worker importing the module doesn't re-queue the root.
    scrape.delay(SITE)
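A minimal sketch of querying the index this scraper populates, assuming the file above is saved as tasks.py, a worker has been started with `celery -A tasks worker --loglevel=info`, and the crawl seeded by running `python tasks.py`; the search term here is only an example:

from elasticsearch import Elasticsearch

es = Elasticsearch()
# Full-text match against the text field the scraper indexes.
res = es.search(index="test-search", body={"query": {"match": {"text": "scraper"}}})
for hit in res["hits"]["hits"]:
    print(hit["_source"]["link"])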