Skip to content

Instantly share code, notes, and snippets.

@ruanbekker
Created October 14, 2017 21:27
Show Gist options
  • Save ruanbekker/0e08a0ee46cfcfedafdb62efbefe524a to your computer and use it in GitHub Desktop.
Save ruanbekker/0e08a0ee46cfcfedafdb62efbefe524a to your computer and use it in GitHub Desktop.
Python Web Scraper for Octopress which Pushes Data to ES
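# System packages needed to build lxml (the parser BeautifulSoup uses below):
# centos: libxslt-devel python-devel
# debian: libxml2-dev libxslt1-dev python-dev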
import re
import time
import requests
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch
# Client used for every Elasticsearch call in this script.
es_client = Elasticsearch(['http://10.0.1.11:9200'])
# Reset the target index: delete it first (400/404 responses are ignored, so a
# missing index is not an error), then create it again (a 400 response is
# ignored). The original issued these two calls in the opposite order, which
# created the index and then immediately deleted it.
drop_index = es_client.indices.delete(index='myindex-test', ignore=[400, 404])
create_index = es_client.indices.create(index='myindex-test', ignore=400)
def urlparser(title, url):
    """Scrape one page and index its title and tags into Elasticsearch.

    Args:
        title: Despite the name, this is the address of the page to fetch;
            it is passed directly to ``requests.get``.
        url: Value stored in the ``url`` field of the indexed document.

    Returns:
        None. The document is written to the ``myindex-test`` index.
    """
    # scrape title
    page = requests.get(title).content
    soup = BeautifulSoup(page, 'lxml')
    # A page without a <title> element has soup.title == None; guard against
    # the AttributeError and index the page with a null title instead.
    title_name = soup.title.string if soup.title is not None else None

    # scrape tags: text of every element carrying the "category" class
    tag_names = [tag.text for tag in soup.findAll(attrs={"class": "category"})]

    # payload for elasticsearch ('date' is the day the page was scraped)
    doc = {
        'date': time.strftime("%Y-%m-%d"),
        'title': title_name,
        'tags': tag_names,
        'url': url,
    }

    # ingest payload into elasticsearch
    es_client.index(index="myindex-test", doc_type="docs", body=doc)

    # Pause half a second after each page (presumably to throttle requests).
    time.sleep(0.5)
# Download the sitemap and collect the text of every <loc> element in it.
sitemap_feed = 'http://blog.ruanbekker.com/sitemap.xml'
sitemap_response = requests.get(sitemap_feed)
sitemap_index = BeautifulSoup(sitemap_response.content, 'html.parser')
urls = []
for loc_tag in sitemap_index.findAll('loc'):
    urls.append(loc_tag.text)

# Scrape and index each listed page; the same URL is passed as both the
# address to fetch and the value stored in the document.
for page_url in urls:
    urlparser(page_url, page_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment