Last active
May 1, 2020 12:11
-
-
Save portableant/489231a3abf616c16c279a76b741a147 to your computer and use it in GitHub Desktop.
Scrape a page to solr instance example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys
import hashlib
import urllib.error
import urllib.request
import xml.etree.ElementTree as ET

from bs4 import BeautifulSoup
import solr

# Scrape every page listed in a sitemap XML file and index its title and
# body text into a Solr instance.

limit = 0  # How many iterations max? Enter 0 for no limit.
solrUrl = 'solrURL'  # The URL of the solr instance
sitemaps_ns = 'http://www.sitemaps.org/schemas/sitemap/0.9'  # The xmlns for the sitemap schema


def main():
    """Parse the sitemap named on the command line, fetch every <loc> URL,
    extract the <title> and the first element with class "rounded", and add
    each page as a document to the configured Solr instance.

    Exits with status 1 when no sitemap path is supplied.
    """
    if len(sys.argv) != 2:
        print('Usage: ./sitemap.py path')
        sys.exit(1)

    sitemapTree = ET.parse(sys.argv[1])
    solrInstance = solr.SolrConnection(solrUrl)  # Solr Connection object
    print(solrInstance)

    numAdded = 0
    # Find all of the URLs in the form <url>...<loc>URL</loc>...</url>
    for counter, urlElem in enumerate(
            sitemapTree.findall('{%s}url/{%s}loc' % (sitemaps_ns, sitemaps_ns)),
            start=1):
        if limit > 0 and counter > limit:
            break  # For testing, you can cap how many sitemap pages to consider

        url = urlElem.text  # Get the url text from the element
        print(url)

        try:
            with urllib.request.urlopen(url) as response:
                html = response.read().decode('utf8')
        except (urllib.error.URLError, UnicodeDecodeError, ValueError, OSError):
            # Network failure, malformed URL, or undecodable payload. Skip.
            print("Error: Cannot get content from URL: " + url)
            continue

        try:
            soup = BeautifulSoup(html, 'lxml')  # Try to parse the HTML of the page
        except Exception:
            print("Error: Cannot parse HTML from URL: " + url)
            continue  # Cannot parse HTML. Skip.

        if soup.html is None:  # Check if there is an <html> tag
            print("Error: No HTML tag found at URL: " + url)
            continue  # No <html> tag. Skip.

        try:
            # .find() returns None when absent, so .string raises AttributeError.
            title = soup.find('title').string
            print(title)
        except AttributeError:
            print("Error: Could not parse title tag found at URL: " + url)
            continue  # Could not parse <title> tag. Skip.

        try:
            # First element carrying class "rounded" holds the page body text;
            # IndexError when no such element exists.
            body = soup.find_all(class_="rounded")[0].text
        except IndexError:
            print("Error: Could not parse body tag found at URL: " + url)
            continue  # Could not parse <body> tag. Skip.

        # Get an md5 hash of the url for the unique id
        url_md5 = hashlib.md5(url.encode()).hexdigest()
        try:
            # Add to the Solr instance
            solrInstance.add(id=url_md5, url=url, body=body, title=title,
                             contentType='do-not-touch')
            # BUGFIX: original printed the builtin `id` function; print the
            # actual document id instead.
            print(url_md5)
            print(url)
        except Exception as inst:
            print("Error adding URL: " + url)
            print("\tWith Message: " + str(inst))
        else:
            print("Added Page \"" + title + "\" with URL " + url)
            numAdded += 1

    try:
        solrInstance.commit()  # Commit the additions
    except Exception:
        print("Could not Commit Changes to SOLR Instance - Check SOLR logs for more info")
    else:
        print("Success. " + str(numAdded) + " documents added to index")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment