@ruanbekker · Created April 30, 2017
Scrapes URL, Title and Keywords from a Nested Sitemap into Elasticsearch
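A nested sitemap here means a sitemap index whose <loc> entries point at further sitemaps, each of which lists the actual page URLs. A hypothetical example of the structure the script walks (domain.com stands in for the real site):

<!-- sitemap index at http://www.domain.com/sitemap.xml -->
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap>
    <loc>http://www.domain.com/sitemap-posts.xml</loc>
  </sitemap>
</sitemapindex>

<!-- each sub-sitemap then lists the page urls to scrape -->
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>http://www.domain.com/some-post/</loc>
  </url>
</urlset>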
import time

import requests
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch

es_client = Elasticsearch(['http://search-domain:9200'])

# recreate the index: delete it if it exists, then create it fresh
drop_index = es_client.indices.delete(index='myindex', ignore=[400, 404])
create_index = es_client.indices.create(index='myindex', ignore=400)
def urlparser(title, url):
    # 'title' is the page url to fetch; scrape the page title from it
    page = requests.get(title).content
    soup = BeautifulSoup(page, 'lxml')
    title_name = soup.title.string

    # scrape the keywords meta tag, if present
    desc = soup.findAll(attrs={"name": "keywords"})
    if len(desc) >= 1:
        tag_names = desc[0]['content'].split(',')
    else:
        tag_names = []

    # payload for elasticsearch
    doc = {
        'date': time.strftime("%Y-%m-%d"),
        'title': title_name,
        'tags': tag_names,
        'url': url
    }

    # ingest the payload into elasticsearch and throttle between pages
    res = es_client.index(index="myindex", doc_type="docs", body=doc)
    print(res)
    time.sleep(1.5)
sitemap_feed = 'http://www.domain.com/sitemap.xml'
page = requests.get(sitemap_feed)
sitemap_index = BeautifulSoup(page.content, 'html.parser')
urls = [element.text for element in sitemap_index.findAll('loc')]

# each <loc> in the sitemap index points at a sub-sitemap;
# walk each one and scrape every page url it lists
for sub_sitemap_feed in urls:
    sub_page = requests.get(sub_sitemap_feed)
    sub_sitemap_index = BeautifulSoup(sub_page.content, 'html.parser')
    sub_urls = [element.text for element in sub_sitemap_index.findAll('loc')]
    for page_url in sub_urls:
        urlparser(page_url, page_url)
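To verify the ingest, a minimal sketch of a match-all query against the same index, assuming the same 'search-domain' host as above (on the 5.x-era client the gist targets, hits.total is a plain count):

from elasticsearch import Elasticsearch

es_client = Elasticsearch(['http://search-domain:9200'])

# match-all query to inspect what the scraper indexed
res = es_client.search(index='myindex', body={'query': {'match_all': {}}})
print('documents indexed:', res['hits']['total'])
for hit in res['hits']['hits']:
    print(hit['_source']['url'], hit['_source']['tags'])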
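The script indexes one document per HTTP round trip. For large sitemaps, the client's bulk helper would cut that down; a rough sketch under the same index and doc_type assumptions, where bulk_ingest is a hypothetical helper fed the doc payloads built above:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es_client = Elasticsearch(['http://search-domain:9200'])

def bulk_ingest(docs):
    # wrap each payload in a bulk action targeting the same index/type
    actions = [
        {'_index': 'myindex', '_type': 'docs', '_source': doc}
        for doc in docs
    ]
    # bulk() returns (indexed_count, errors)
    success, errors = bulk(es_client, actions)
    print(success, errors)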