Last active
May 1, 2020 12:11
-
-
Save portableant/489231a3abf616c16c279a76b741a147 to your computer and use it in GitHub Desktop.
Scrape a page to solr instance example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys
import hashlib
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET

from bs4 import BeautifulSoup
import solr

# Scrape every page listed in a sitemap XML file and index its title and
# body text into a Solr instance.

limit = 0  # How many iterations max? Enter 0 for no limit.
solrUrl = 'solrURL'  # The URL of the solr instance
sitemaps_ns = 'http://www.sitemaps.org/schemas/sitemap/0.9'  # The xmlns for the sitemap schema


def main():
    """Parse the sitemap named on the command line, fetch every <loc> URL,
    extract the <title> and the first element with class "rounded", and add
    each page as a document to the configured Solr instance.

    Exits with status 1 when no sitemap path is supplied.
    """
    if len(sys.argv) != 2:
        print('Usage: ./sitemap.py path')
        sys.exit(1)

    sitemapTree = ET.parse(sys.argv[1])
    solrInstance = solr.SolrConnection(solrUrl)  # Solr Connection object
    print(solrInstance)

    numAdded = 0
    # Find all of the URLs in the form <url>...<loc>URL</loc>...</url>
    for counter, urlElem in enumerate(
            sitemapTree.findall('{%s}url/{%s}loc' % (sitemaps_ns, sitemaps_ns)),
            start=1):
        if limit > 0 and counter > limit:
            break  # For testing, you can cap how many sitemap pages to consider

        url = urlElem.text  # Get the url text from the element
        print(url)

        try:
            with urllib.request.urlopen(url) as response:
                html = response.read().decode('utf8')
        except (urllib.error.URLError, UnicodeDecodeError, ValueError, OSError):
            # Network failure, malformed URL, or undecodable payload. Skip.
            print("Error: Cannot get content from URL: " + url)
            continue

        try:
            soup = BeautifulSoup(html, 'lxml')  # Try to parse the HTML of the page
        except Exception:
            print("Error: Cannot parse HTML from URL: " + url)
            continue  # Cannot parse HTML. Skip.

        if soup.html is None:  # Check if there is an <html> tag
            print("Error: No HTML tag found at URL: " + url)
            continue  # No <html> tag. Skip.

        try:
            # .find() returns None when absent, so .string raises AttributeError.
            title = soup.find('title').string
            print(title)
        except AttributeError:
            print("Error: Could not parse title tag found at URL: " + url)
            continue  # Could not parse <title> tag. Skip.

        try:
            # First element carrying class "rounded" holds the page body text;
            # IndexError when no such element exists.
            body = soup.find_all(class_="rounded")[0].text
        except IndexError:
            print("Error: Could not parse body tag found at URL: " + url)
            continue  # Could not parse <body> tag. Skip.

        # Get an md5 hash of the url for the unique id
        url_md5 = hashlib.md5(url.encode()).hexdigest()
        try:
            # Add to the Solr instance
            solrInstance.add(id=url_md5, url=url, body=body, title=title,
                             contentType='do-not-touch')
            # BUGFIX: original printed the builtin `id` function; print the
            # actual document id instead.
            print(url_md5)
            print(url)
        except Exception as inst:
            print("Error adding URL: " + url)
            print("\tWith Message: " + str(inst))
        else:
            print("Added Page \"" + title + "\" with URL " + url)
            numAdded += 1

    try:
        solrInstance.commit()  # Commit the additions
    except Exception:
        print("Could not Commit Changes to SOLR Instance - Check SOLR logs for more info")
    else:
        print("Success. " + str(numAdded) + " documents added to index")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment