# geekygirldawn/chaoss_urls.py

## chaoss_urls.py
# I created a super hacky Jupyter Notebook that looks at
# https://chaoss.community/kbtopic/all-metrics/ which
# contains a list of all of our metrics, and then uses
# BeautifulSoup + hackiness to extract the short link
# from the wordpress pages. I was in a hurry, so I
# printed the csv output to the screen instead of to
# a csv file.

import urllib.request as urllib2
from bs4 import BeautifulSoup

# Read and parse the page that lists all of the metrics. The "with" block
# closes the HTTP response as soon as the body has been read.
with urllib2.urlopen('https://chaoss.community/kbtopic/all-metrics/') as response:
    html_doc = response.read()
soup = BeautifulSoup(html_doc, 'html.parser')

# Find the <h2 class="mkb-entry-title"> headings; the links to the
# individual metrics pages are the <a> tags inside them.
links = soup.find_all("h2", {"class": "mkb-entry-title"})

# Loop through those links and read / parse those pages. One CSV line
# (metric url, short link, GitHub url, focus area) is printed per metric.
for heading in links:
    for a in heading.find_all('a'):
        # Get the human readable URL. .get() returns None for an anchor
        # without an href instead of raising, so the failure can be reported
        # below without a second exception escaping the handler.
        metric_url = a.get('href')
        if metric_url is None:
            print(metric_url, 'not parsed')
            continue

        try:
            # Read / parse the metrics page
            with urllib2.urlopen(metric_url) as metric_response:
                metric_html_doc = metric_response.read()
            metric_soup = BeautifulSoup(metric_html_doc, 'html.parser')

            # Find the shortlink aka permalink
            short_link = metric_soup.find("link", {"rel": "shortlink"})['href']

            # Hacky way to grab the details about the focus areas & github url
            # out of some debug stuff that is in the html pages: convert the
            # first "wpb_row" div to a string to work with it.
            debug_str = str(metric_soup.find("div", {"class": "wpb_row"}))

            # Find where the focus areas are mentioned. str.find() returns -1
            # when the marker is absent; fail explicitly rather than slicing
            # from the end of the string.
            index = debug_str.find('focus-areas')
            if index == -1:
                raise ValueError("'focus-areas' marker not found")

            # Work on the 500 chars that start at the marker, split on
            # single quotes.
            parts = debug_str[index:index + 500].split("'")

            # The first part has the actual focus area
            focus_str = parts[0]

            # The part at index 2 holds the GitHub URL: drop its first 15
            # chars (per the original note, the "Debug Objects: " prefix)
            # and keep everything up to the first double quote.
            gh_url = parts[2][15:].split('"')[0]

            # As a quick way to get the csv, print each line in csv format to
            # the screen to be copied into a csv file. Hacky, but quick.
            print(','.join([metric_url, short_link, gh_url, focus_str]))
        except Exception:
            # Best effort: report the page that could not be fetched or parsed
            # and move on. "except Exception" (not a bare "except") keeps
            # Ctrl-C / SystemExit able to stop the script.
            print(metric_url, 'not parsed')
	# I created a super hacky Jupyter Notebook that looks at
	# https://chaoss.community/kbtopic/all-metrics/ which
	# contains a list of all of our metrics, and then uses
	# BeautifulSoup + hackiness to extract the short link
	# from the wordpress pages. I was in a hurry, so I
	# printed the csv output to the screen instead of to
	# a csv file.

import urllib.request as urllib2
from bs4 import BeautifulSoup

# Read and parse the page that lists all of the metrics. The "with" block
# closes the HTTP response as soon as the body has been read.
with urllib2.urlopen('https://chaoss.community/kbtopic/all-metrics/') as response:
    html_doc = response.read()
soup = BeautifulSoup(html_doc, 'html.parser')

# Find the <h2 class="mkb-entry-title"> headings; the links to the
# individual metrics pages are the <a> tags inside them.
links = soup.find_all("h2", {"class": "mkb-entry-title"})

# Loop through those links and read / parse those pages. One CSV line
# (metric url, short link, GitHub url, focus area) is printed per metric.
for heading in links:
    for a in heading.find_all('a'):
        # Get the human readable URL. .get() returns None for an anchor
        # without an href instead of raising, so the failure can be reported
        # below without a second exception escaping the handler.
        metric_url = a.get('href')
        if metric_url is None:
            print(metric_url, 'not parsed')
            continue

        try:
            # Read / parse the metrics page
            with urllib2.urlopen(metric_url) as metric_response:
                metric_html_doc = metric_response.read()
            metric_soup = BeautifulSoup(metric_html_doc, 'html.parser')

            # Find the shortlink aka permalink
            short_link = metric_soup.find("link", {"rel": "shortlink"})['href']

            # Hacky way to grab the details about the focus areas & github url
            # out of some debug stuff that is in the html pages: convert the
            # first "wpb_row" div to a string to work with it.
            debug_str = str(metric_soup.find("div", {"class": "wpb_row"}))

            # Find where the focus areas are mentioned. str.find() returns -1
            # when the marker is absent; fail explicitly rather than slicing
            # from the end of the string.
            index = debug_str.find('focus-areas')
            if index == -1:
                raise ValueError("'focus-areas' marker not found")

            # Work on the 500 chars that start at the marker, split on
            # single quotes.
            parts = debug_str[index:index + 500].split("'")

            # The first part has the actual focus area
            focus_str = parts[0]

            # The part at index 2 holds the GitHub URL: drop its first 15
            # chars (per the original note, the "Debug Objects: " prefix)
            # and keep everything up to the first double quote.
            gh_url = parts[2][15:].split('"')[0]

            # As a quick way to get the csv, print each line in csv format to
            # the screen to be copied into a csv file. Hacky, but quick.
            print(','.join([metric_url, short_link, gh_url, focus_str]))
        except Exception:
            # Best effort: report the page that could not be fetched or parsed
            # and move on. "except Exception" (not a bare "except") keeps
            # Ctrl-C / SystemExit able to stop the script.
            print(metric_url, 'not parsed')