judges119/search.py

## search.py
from celery import Celery
from elasticsearch import Elasticsearch
import json
from lxml import html
import redis
import requests
import sys
from urllib.parse import urlparse, urlunparse

# Root URL of the site being crawled; used as the seed link and to decide
# whether a discovered link should be followed.
SITE = 'https://adamogrady.id.au/'

# Redis holds one key per link that has already been scraped.
r = redis.Redis(
    host='localhost',
    port=6379,
    db=0,
)

# Celery application whose tasks are sent through a local AMQP broker.
app = Celery(
    'tasks',
    broker='pyamqp://guest@localhost//',
)

# Elasticsearch client constructed with no arguments (library defaults).
es = Elasticsearch()

@app.task
def scrape(link):
    """Fetch one page, index its text, and queue the on-site links it contains.

    If ``link`` already has a key in Redis the page is not fetched again.
    Otherwise the page is downloaded, every text node under ``<body>`` is
    joined with commas and indexed into the ``test-search`` Elasticsearch
    index, the link is recorded in Redis, and a new ``scrape`` task is queued
    for each cleaned link that starts with ``SITE`` and has no Redis key yet.

    Args:
        link: URL of the page to scrape.

    Returns:
        ``link`` converted to ``str``.
    """
    if r.exists(link):
        return str(link)

    # Without a timeout a stalled server would block this worker indefinitely.
    page = requests.get(link, timeout=30)
    tree = html.fromstring(page.content)

    hrefs = tree.xpath('//a/@href')
    text = tree.xpath('//body//text()')
    doc = {
        'link': link,
        'text': ','.join(text)
    }
    es.index(index="test-search", doc_type='page', body=doc)
    r.set(link, 1)

    # dict.fromkeys drops hrefs repeated on the page (keeping first-seen
    # order) so each target is queued at most once per page.
    for single_link in dict.fromkeys(map(clean_url, hrefs)):
        # Prefix match: a substring test would also accept off-site URLs that
        # merely contain SITE somewhere, e.g. inside a query string.
        if single_link.startswith(SITE) and not r.exists(single_link):
            scrape.delay(single_link)
    return str(link)

def clean_url(link):
    """Turn a link found on a page into an absolute URL without a fragment.

    Links that carry no network location (for example ``/about`` or
    ``#top``) are resolved against ``SITE``. Links that already name a host
    keep that host. Any ``#fragment`` is removed so that anchors within the
    same page collapse to one URL.

    Args:
        link: The raw ``href`` value.

    Returns:
        The cleaned URL as a string.
    """
    # Imported locally: the module-level import only brings in
    # urlparse and urlunparse.
    from urllib.parse import urljoin

    parts = urlparse(link)
    if not parts.netloc:
        # SITE is a complete URL (scheme, host and path), so it cannot be
        # assigned as a bare netloc; urljoin resolves the link against it.
        parts = urlparse(urljoin(SITE, link))
    # _replace returns a new tuple instead of modifying the existing one, so
    # its result has to be the value that is unparsed.
    return urlunparse(parts._replace(fragment=''))

# Queue the first scrape task for the site root. This statement sits at module
# level, so it runs every time the module is executed or imported.
scrape.delay(SITE)
	from celery import Celery
	from elasticsearch import Elasticsearch
	import json
	from lxml import html
	import redis
	import requests
	import sys
	from urllib.parse import urlparse, urlunparse

	# Root URL of the site being crawled; used as the seed link and to decide
	# whether a discovered link should be followed.
	SITE = 'https://adamogrady.id.au/'

	# Redis holds one key per link that has already been scraped.
	r = redis.Redis(
		host='localhost',
		port=6379,
		db=0,
	)

	# Celery application whose tasks are sent through a local AMQP broker.
	app = Celery(
		'tasks',
		broker='pyamqp://guest@localhost//',
	)

	# Elasticsearch client constructed with no arguments (library defaults).
	es = Elasticsearch()

	@app.task
	def scrape(link):
		"""Fetch one page, index its text, and queue the on-site links it contains.

		If ``link`` already has a key in Redis the page is not fetched again.
		Otherwise the page is downloaded, every text node under ``<body>`` is
		joined with commas and indexed into the ``test-search`` Elasticsearch
		index, the link is recorded in Redis, and a new ``scrape`` task is
		queued for each cleaned link that starts with ``SITE`` and has no
		Redis key yet.

		Args:
			link: URL of the page to scrape.

		Returns:
			``link`` converted to ``str``.
		"""
		if r.exists(link):
			return str(link)

		# Without a timeout a stalled server would block this worker
		# indefinitely.
		page = requests.get(link, timeout=30)
		tree = html.fromstring(page.content)

		hrefs = tree.xpath('//a/@href')
		text = tree.xpath('//body//text()')
		doc = {
			'link': link,
			'text': ','.join(text)
		}
		es.index(index="test-search", doc_type='page', body=doc)
		r.set(link, 1)

		# dict.fromkeys drops hrefs repeated on the page (keeping first-seen
		# order) so each target is queued at most once per page.
		for single_link in dict.fromkeys(map(clean_url, hrefs)):
			# Prefix match: a substring test would also accept off-site URLs
			# that merely contain SITE somewhere, e.g. inside a query string.
			if single_link.startswith(SITE) and not r.exists(single_link):
				scrape.delay(single_link)
		return str(link)

	def clean_url(link):
		"""Turn a link found on a page into an absolute URL without a fragment.

		Links that carry no network location (for example ``/about`` or
		``#top``) are resolved against ``SITE``. Links that already name a
		host keep that host. Any ``#fragment`` is removed so that anchors
		within the same page collapse to one URL.

		Args:
			link: The raw ``href`` value.

		Returns:
			The cleaned URL as a string.
		"""
		# Imported locally: the module-level import only brings in
		# urlparse and urlunparse.
		from urllib.parse import urljoin

		parts = urlparse(link)
		if not parts.netloc:
			# SITE is a complete URL (scheme, host and path), so it cannot be
			# assigned as a bare netloc; urljoin resolves the link against it.
			parts = urlparse(urljoin(SITE, link))
		# _replace returns a new tuple instead of modifying the existing one,
		# so its result has to be the value that is unparsed.
		return urlunparse(parts._replace(fragment=''))

	# Queue the first scrape task for the site root. This statement is not
	# inside any function, so it runs whenever the surrounding code is executed.
	scrape.delay(SITE)