@ruanbekker · Created April 30, 2017
Scrapes URL, Title and Keywords from a Nested Sitemap into Elasticsearch
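A nested sitemap here means a sitemap index whose <loc> entries point at further sitemaps, each of which lists the actual page URLs. A hypothetical example of the structure the script walks (domain.com stands in for the real site):

<!-- sitemap index at http://www.domain.com/sitemap.xml -->
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>http://www.domain.com/sitemap-posts.xml</loc>
  </sitemap>
</sitemapindex>

<!-- each sub-sitemap then lists the page urls to scrape -->
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>http://www.domain.com/some-post/</loc>
  </url>
</urlset>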
import time

import requests
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch

es_client = Elasticsearch(['http://search-domain:9200'])

# recreate the index: delete it if it exists, then create it fresh
drop_index = es_client.indices.delete(index='myindex', ignore=[400, 404])
create_index = es_client.indices.create(index='myindex', ignore=400)
def urlparser(title, url):
    # 'title' is the page url to fetch; scrape the page title from it
    page = requests.get(title).content
    soup = BeautifulSoup(page, 'lxml')
    title_name = soup.title.string

    # scrape the keywords meta tag, if present
    desc = soup.findAll(attrs={"name": "keywords"})
    if len(desc) >= 1:
        tag_names = desc[0]['content'].split(',')
    else:
        tag_names = []

    # payload for elasticsearch
    doc = {
        'date': time.strftime("%Y-%m-%d"),
        'title': title_name,
        'tags': tag_names,
        'url': url
    }

    # ingest the payload into elasticsearch and throttle between pages
    res = es_client.index(index="myindex", doc_type="docs", body=doc)
    print(res)
    time.sleep(1.5)
sitemap_feed = 'http://www.domain.com/sitemap.xml'
page = requests.get(sitemap_feed)
sitemap_index = BeautifulSoup(page.content, 'html.parser')
urls = [element.text for element in sitemap_index.findAll('loc')]

# each <loc> in the sitemap index points at a sub-sitemap;
# walk each one and scrape every page url it lists
for sub_sitemap_feed in urls:
    sub_page = requests.get(sub_sitemap_feed)
    sub_sitemap_index = BeautifulSoup(sub_page.content, 'html.parser')
    sub_urls = [element.text for element in sub_sitemap_index.findAll('loc')]
    for page_url in sub_urls:
        urlparser(page_url, page_url)
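To verify the ingest, a minimal sketch of a match-all query against the same index, assuming the same 'search-domain' host as above (on the 5.x-era client the gist targets, hits.total is a plain count):

from elasticsearch import Elasticsearch

es_client = Elasticsearch(['http://search-domain:9200'])

# match-all query to inspect what the scraper indexed
res = es_client.search(index='myindex', body={'query': {'match_all': {}}})
print('documents indexed:', res['hits']['total'])
for hit in res['hits']['hits']:
    print(hit['_source']['url'], hit['_source']['tags'])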
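The script indexes one document per HTTP round trip. For large sitemaps, the client's bulk helper would cut that down; a rough sketch under the same index and doc_type assumptions, where bulk_ingest is a hypothetical helper fed the doc payloads built above:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es_client = Elasticsearch(['http://search-domain:9200'])

def bulk_ingest(docs):
    # wrap each payload in a bulk action targeting the same index/type
    actions = [
        {'_index': 'myindex', '_type': 'docs', '_source': doc}
        for doc in docs
    ]
    # bulk() returns (indexed_count, errors)
    success, errors = bulk(es_client, actions)
    print(success, errors)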