#! /usr/bin/env python3
"""
---------------------------------------------------------------------
Program: hcpcs_scrape.py
Author: Kyle Barron <barronk@mit.edu>
Created: 2/5/2018
Updated: 2/5/2018
Purpose: Scrape HCPCS codes from the internet
"""
from time import sleep

import lxml.html as LH
import pandas as pd
import requests
from bs4 import BeautifulSoup
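
# Third-party dependencies (assuming a standard pip setup):
#     pip install requests pandas lxml beautifulsoup4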


def main(outpath='hcpcs.csv'):
    """Scrape HCPCS codes for all years where the data exists on these sites.

    Args:
        outpath: Path to export the final CSV file of codes. The CSV is
            indexed by 'HCPCS Code' with columns 'Description' and 'Year'.
    """
    data_list = []
    for year in range(2007, 2018):
        data_list.append(get_hcpcs_codes(year))
    data = pd.concat(data_list)
    data.to_csv(outpath)


def get_hcpcs_codes(year, sleep_time=5):
    """Scrape HCPCS codes from http://www.icd9data.com/HCPCS/ and
    https://www.hcpcsdata.com/Codes.

    Args:
        year: Year of codes to scrape. http://www.icd9data.com/HCPCS/ seems
            to have data from 2007 through 2016;
            https://www.hcpcsdata.com/Codes has data for 2017/2018.
        sleep_time: Number of seconds to wait between page loads. A larger
            number puts less pressure on the servers that run these sites.

    Returns:
        DataFrame with HCPCS codes, descriptions, and year.
    """
    if year > 2016:
        top_level = 'https://www.hcpcsdata.com/Codes'
        page = requests.get(top_level)
        sleep(sleep_time)

        # Collect links to each code-group page from the top-level table.
        sub_level_links = LH.fromstring(page.content).xpath('//tr/td/a/@href')
        sub_level_links = [
            'https://www.hcpcsdata.com' + x for x in sub_level_links
        ]

        # From each group page, collect links to the individual code pages.
        all_leaf_links = []
        for sub_level in sub_level_links:
            page = requests.get(sub_level)
            sleep(sleep_time)
            leaf_links = LH.fromstring(page.content).xpath('//tr/td/a/@href')
            leaf_links = ['https://www.hcpcsdata.com' + x for x in leaf_links]
            all_leaf_links.extend(leaf_links)

        # Visit each code page and pull out the code and its description.
        all_codes = {}
        for i, leaf in enumerate(all_leaf_links, start=1):
            page = requests.get(leaf)
            sleep(sleep_time)
            soup = BeautifulSoup(page.content, 'lxml')
            code = soup.find(class_='identifier16').get_text()
            title = soup.find('h5').get_text()
            all_codes[code] = title
            if i % 20 == 0:
                msg = f'Finished scraping page {i} of {len(all_leaf_links)}'
                msg += f' for year {year}'
                print(msg)

        df = pd.DataFrame.from_dict(all_codes, orient='index')
        df.index.rename('HCPCS Code', inplace=True)
        df.columns = ['Description']
    elif 2007 <= year <= 2016:
        top_level = f'http://www.icd9data.com/HCPCS/{year}/default.htm'
        page = requests.get(top_level)
        sleep(sleep_time)

        # Collect links to each code-range page from the top-level index.
        sub_level_links = LH.fromstring(
            page.content).xpath('//ul[@class="codeList"]/li/a/@href')
        sub_level_links = [
            'http://www.icd9data.com' + x for x in sub_level_links
        ]

        # From each code-range page, collect links to the individual code
        # pages.
        all_leaf_links = []
        for sub_level in sub_level_links:
            page = requests.get(sub_level)
            sleep(sleep_time)
            leaf_links = LH.fromstring(
                page.content).xpath('//ul[@class="hcpcs"]/li/span/a/@href')
            leaf_links = ['http://www.icd9data.com' + x for x in leaf_links]
            all_leaf_links.extend(leaf_links)

        # Visit each code page and pull out the code and its description.
        all_codes = {}
        for i, leaf in enumerate(all_leaf_links, start=1):
            page = requests.get(leaf)
            sleep(sleep_time)
            soup = BeautifulSoup(page.content, 'lxml')
            code = soup.find(class_='identifier').get_text()
            title = soup.find('dd').get_text()
            all_codes[code] = title
            if i % 20 == 0:
                print(f'Finished scraping page {i} of {len(all_leaf_links)}')

        df = pd.DataFrame.from_dict(all_codes, orient='index')
        df.index.rename('HCPCS Code', inplace=True)
        df.columns = ['Description']
    else:
        raise ValueError(f'Codes for {year} not provided')

    df['Year'] = year
    return df


if __name__ == '__main__':
    main()
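
# Example usage from another script (a sketch; assumes this file is saved as
# hcpcs_scrape.py on your path and that the sites are still reachable):
#
#     from hcpcs_scrape import get_hcpcs_codes
#     codes_2016 = get_hcpcs_codes(2016, sleep_time=10)
#     codes_2016.to_csv('hcpcs_2016.csv')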