## th3o6a1d/extractor.py

## extractor.py
import sys, csv, os

###
### Author Jason Theobald, MD/MBA Student 2014
### This script reads all SPARCS files with the .DAT extension in the same folder as the script
### Use this to retrieve patients by ICD-9 Code and generate a .csv file
### To run: python extractor.py on Mac OSX or Windows PC (with Python 2.7 installed)
###

### Layout of one fixed-width SPARCS record, given as (start, end) slice bounds.
### Refer to the SPARCS code list for the positions. Python uses 0-based indexing,
### so each start is the SPARCS start number minus 1; the end number is the same.
AGE_FIELD = (101, 104)
ZIP_FIELD = (164, 169)
COUNTY_FIELD = (173, 175)
FACILITY_ID_FIELD = (199, 205)
FACILITY_NAME_FIELD = (206, 276)

### The 15 ICD code fields (principal code first) are each 7 characters wide and
### start 24 characters apart. Computing the bounds avoids hand-typed slices,
### which is how the last field used to end before it started and was always empty.
ICD_FIELD_COUNT = 15
ICD_FIRST_START = 1642
ICD_FIELD_STRIDE = 24
ICD_FIELD_WIDTH = 7
ICD_FIELDS = tuple(
    (start, start + ICD_FIELD_WIDTH)
    for start in range(
        ICD_FIRST_START,
        ICD_FIRST_START + ICD_FIELD_COUNT * ICD_FIELD_STRIDE,
        ICD_FIELD_STRIDE,
    )
)

### Enter your collection of ICD-9 codes of interest here.
### In this case, we are looking for hips and knees.
HIP_CODES = frozenset([
    "8151", "8152", "0070", "0071", "0072", "0073",
    "8153", "0074", "0075", "0076", "0077",
])
KNEE_CODES = frozenset(["8154", "8155", "0080", "0081", "0082", "0083", "0084"])

### Header of the .csv output file: the field names, in the order they are written.
CSV_HEADER = (
    ["Type", "Age", "Zip", "County", "FacilityID", "FacilityName", "Principal ICD"]
    + ["ICD%d" % number for number in range(2, ICD_FIELD_COUNT + 1)]
)

### The script targets Python 2.7 (raw_input); fall back to input() on Python 3.
try:
    wait_for_enter = raw_input
except NameError:
    wait_for_enter = input


def read_field(line, bounds):
    """Return the text between the (start, end) bounds of line, stripped.

    Bounds beyond the end of a short line simply yield an empty string.
    """
    start, end = bounds
    return line[start:end].strip()


def parse_record(line):
    """Split one record line into its fields of interest.

    Returns a pair (demographics, codes):
      demographics -- [age, zip, county, facilityID, facilityName]
      codes        -- the 15 ICD codes, principal code first
    """
    demographics = [
        read_field(line, bounds)
        for bounds in (
            AGE_FIELD,
            ZIP_FIELD,
            COUNTY_FIELD,
            FACILITY_ID_FIELD,
            FACILITY_NAME_FIELD,
        )
    ]
    codes = [read_field(line, bounds) for bounds in ICD_FIELDS]
    return demographics, codes


def classify_joints(codes):
    """Return the row labels a record earns, in output order.

    The result contains "Hip" if any code is a hip code and "Knee" if any
    code is a knee code, so a record can earn both labels or neither.
    """
    labels = []
    if any(code in HIP_CODES for code in codes):
        labels.append("Hip")
    if any(code in KNEE_CODES for code in codes):
        labels.append("Knee")
    return labels


def open_csv_output(path):
    """Open path for csv.writer in the mode the running Python expects.

    Python 2's csv module wants a binary-mode file; Python 3's wants a
    text-mode file opened with newline=''.
    """
    if sys.version_info[0] < 3:
        return open(path, 'wb')
    return open(path, 'w', newline='')


def extract_file(dat_name):
    """Write the hip and knee records of one .dat file to a JOINTSxx.csv file.

    The output name uses the two characters just before the ".dat" suffix.
    NOTE: input files that share those two characters write to the same CSV,
    so the later one overwrites the earlier one.
    """
    csv_name = 'JOINTS' + dat_name[-6:-4] + '.csv'

    # "with" closes both files even on error, so buffered rows are flushed.
    with open(dat_name) as source, open_csv_output(csv_name) as target:
        writer = csv.writer(target, delimiter=',')
        writer.writerow(CSV_HEADER)

        for line in source:
            demographics, codes = parse_record(line)
            # A record matching both lists is written twice, once per label.
            for label in classify_joints(codes):
                print(label)
                writer.writerow([label] + demographics + codes)


def main():
    """List the .dat files in the current directory, then extract each one."""
    ### Shows what files are being read. Will open all .dat files in current directory.
    dat_names = [
        name for name in os.listdir(os.curdir) if name[-3:].lower() == "dat"
    ]
    print("")
    print("Files to extract from...")
    for name in dat_names:
        print(name)

    ### Wait for keyboard prompt to show user the files that are being read.
    wait_for_enter("Press enter to continue")

    for name in dat_names:
        extract_file(name)


if __name__ == "__main__":
    main()