#! /usr/bin/env python3
"""
---------------------------------------------------------------------
Program: hcpcs_scrape.py
Author: Kyle Barron <barronk@mit.edu>
Created: 2/5/2018
Updated: 2/5/2018
Purpose: Scrape HCPCS codes from the internet
"""
from time import sleep

import lxml.html as LH
import pandas as pd
import requests
from bs4 import BeautifulSoup
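
# Third-party dependencies (assuming a standard pip setup):
#     pip install requests pandas lxml beautifulsoup4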


def main(outpath='hcpcs.csv'):
    """Scrape HCPCS codes for all years where the data exists on these sites.

    Args:
        outpath: Path to export the final CSV file of codes. The CSV is
            indexed by 'HCPCS Code' with columns 'Description' and 'Year'.
    """
    data_list = []
    for year in range(2007, 2018):
        data_list.append(get_hcpcs_codes(year))
    data = pd.concat(data_list)
    data.to_csv(outpath)


def get_hcpcs_codes(year, sleep_time=5):
    """Scrape HCPCS codes from http://www.icd9data.com/HCPCS/ and
    https://www.hcpcsdata.com/Codes.

    Args:
        year: Year of codes to scrape. http://www.icd9data.com/HCPCS/ seems
            to have data from 2007 through 2016;
            https://www.hcpcsdata.com/Codes has data for 2017/2018.
        sleep_time: Number of seconds to wait between page loads. A larger
            number puts less pressure on the servers that run these sites.

    Returns:
        DataFrame with HCPCS codes, descriptions, and year.
    """
    if year > 2016:
        top_level = 'https://www.hcpcsdata.com/Codes'
        page = requests.get(top_level)
        sleep(sleep_time)

        # Collect links to each code-group page from the top-level table.
        sub_level_links = LH.fromstring(page.content).xpath('//tr/td/a/@href')
        sub_level_links = [
            'https://www.hcpcsdata.com' + x for x in sub_level_links
        ]

        # From each group page, collect links to the individual code pages.
        all_leaf_links = []
        for sub_level in sub_level_links:
            page = requests.get(sub_level)
            sleep(sleep_time)
            leaf_links = LH.fromstring(page.content).xpath('//tr/td/a/@href')
            leaf_links = ['https://www.hcpcsdata.com' + x for x in leaf_links]
            all_leaf_links.extend(leaf_links)

        # Visit each code page and pull out the code and its description.
        all_codes = {}
        for i, leaf in enumerate(all_leaf_links, start=1):
            page = requests.get(leaf)
            sleep(sleep_time)
            soup = BeautifulSoup(page.content, 'lxml')
            code = soup.find(class_='identifier16').get_text()
            title = soup.find('h5').get_text()
            all_codes[code] = title
            if i % 20 == 0:
                msg = f'Finished scraping page {i} of {len(all_leaf_links)}'
                msg += f' for year {year}'
                print(msg)

        df = pd.DataFrame.from_dict(all_codes, orient='index')
        df.index.rename('HCPCS Code', inplace=True)
        df.columns = ['Description']
    elif 2007 <= year <= 2016:
        top_level = f'http://www.icd9data.com/HCPCS/{year}/default.htm'
        page = requests.get(top_level)
        sleep(sleep_time)

        # Collect links to each code-range page from the top-level index.
        sub_level_links = LH.fromstring(
            page.content).xpath('//ul[@class="codeList"]/li/a/@href')
        sub_level_links = [
            'http://www.icd9data.com' + x for x in sub_level_links
        ]

        # From each code-range page, collect links to the individual code
        # pages.
        all_leaf_links = []
        for sub_level in sub_level_links:
            page = requests.get(sub_level)
            sleep(sleep_time)
            leaf_links = LH.fromstring(
                page.content).xpath('//ul[@class="hcpcs"]/li/span/a/@href')
            leaf_links = ['http://www.icd9data.com' + x for x in leaf_links]
            all_leaf_links.extend(leaf_links)

        # Visit each code page and pull out the code and its description.
        all_codes = {}
        for i, leaf in enumerate(all_leaf_links, start=1):
            page = requests.get(leaf)
            sleep(sleep_time)
            soup = BeautifulSoup(page.content, 'lxml')
            code = soup.find(class_='identifier').get_text()
            title = soup.find('dd').get_text()
            all_codes[code] = title
            if i % 20 == 0:
                print(f'Finished scraping page {i} of {len(all_leaf_links)}')

        df = pd.DataFrame.from_dict(all_codes, orient='index')
        df.index.rename('HCPCS Code', inplace=True)
        df.columns = ['Description']
    else:
        raise ValueError(f'Codes for {year} not provided')

    df['Year'] = year
    return df


if __name__ == '__main__':
    main()
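
# Example usage from another script (a sketch; assumes this file is saved as
# hcpcs_scrape.py on your path and that the sites are still reachable):
#
#     from hcpcs_scrape import get_hcpcs_codes
#     codes_2016 = get_hcpcs_codes(2016, sleep_time=10)
#     codes_2016.to_csv('hcpcs_2016.csv')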