Skip to content

Instantly share code, notes, and snippets.

@portableant
Last active May 1, 2020 12:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save portableant/489231a3abf616c16c279a76b741a147 to your computer and use it in GitHub Desktop.
Save portableant/489231a3abf616c16c279a76b741a147 to your computer and use it in GitHub Desktop.
Example: scrape every page listed in a sitemap and index them into a Solr instance
import sys
from bs4 import BeautifulSoup
import solr
import hashlib
import urllib.request
import xml.etree.ElementTree as ET
# --- Configuration and one-time setup ---------------------------------------
limit = 0  # How many iterations max? Enter 0 for no limit.
solrUrl = 'solrURL'  # The URL of the solr instance
sitemaps_ns = 'http://www.sitemaps.org/schemas/sitemap/0.9'  # The xmlns for the sitemap schema

# Expect exactly one argument: the path to a sitemap XML file.
if len(sys.argv) != 2:
    print('Usage: ./sitemap.py path')
    sys.exit(1)

sitemapTree = ET.parse(sys.argv[1])  # Parsed sitemap document
solrInstance = solr.SolrConnection(solrUrl)  # Solr Connection object
print(solrInstance)
counter = 0   # Pages examined so far (drives the optional `limit`)
numAdded = 0  # Pages successfully added to the Solr index
# Find all of the URLs in the form <url>...<loc>URL</loc>...</url>
# and index each page into Solr. Per-page failures are logged and skipped
# so one bad page never aborts the whole crawl.
for urlElem in sitemapTree.findall('{%s}url/{%s}loc' % (sitemaps_ns, sitemaps_ns)):
    counter = counter + 1  # Increment counter
    if limit > 0 and counter > limit:
        break  # For testing, you can set a limit to how many pages of the sitemap to consider
    url = urlElem.text  # Get the url text from the element
    print(url)
    try:
        with urllib.request.urlopen(url) as response:
            html = response.read().decode('utf8')
    except Exception:  # narrowed from bare except: so KeyboardInterrupt still exits
        print("Error: Cannot get content from URL: "+url)
        continue  # Cannot get HTML. Skip.
    try:
        soup = BeautifulSoup(html, 'lxml')  # Try to parse the HTML of the page
    except Exception:
        print("Error: Cannot parse HTML from URL: "+url)
        continue  # Cannot parse HTML. Skip.
    if soup.html is None:  # Check if there is an <html> tag
        print("Error: No HTML tag found at URL: "+url)
        continue  # No <html> tag. Skip.
    try:
        title = soup.find('title').string  # Try to set the title
        print(title)
    except Exception:
        print("Error: Could not parse title tag found at URL: "+url)
        continue  # Could not parse <title> tag. Skip.
    try:
        # Page body is taken from the first element with class="rounded"
        # (site-specific content container) — TODO confirm for other sites.
        body = soup.find_all(class_="rounded")[0].text
    except Exception:
        print("Error: Could not parse body tag found at URL: "+url)
        continue  # Could not parse <body> tag. Skip.
    # Get an md5 hash of the url for the unique id
    url_md5 = hashlib.md5(url.encode()).hexdigest()
    try:
        # Add to the Solr instance
        solrInstance.add(id=url_md5, url=url, body=body, title=title, contentType='do-not-touch')
        # BUG FIX: the original printed the builtin `id` function object;
        # the intent was clearly to echo the document id just used.
        print(url_md5)
        print(url)
    except Exception as inst:
        print("Error adding URL: "+url)
        print("\tWith Message: "+str(inst))
    else:
        print("Added Page \""+title+"\" with URL "+url)
        numAdded = numAdded + 1
# Commit everything added above in one go; report how many documents made it.
try:
    solrInstance.commit()  # Commit the additions
except Exception:  # narrowed from bare except: — don't swallow SystemExit/KeyboardInterrupt
    print("Could not Commit Changes to SOLR Instance - Check SOLR logs for more info")
else:
    print("Success. " + str(numAdded) + " documents added to index")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment