Download Fama-French industry classifications
import re
from io import BytesIO
from zipfile import ZipFile

import pandas as pd
import requests
def download_ffind_zip(ind_num):
    """Download a Fama-French SIC code definition file and return its text.

    Fetches ``Siccodes{ind_num}.zip`` from the Dartmouth (Ken French) URL and
    returns the decoded contents of its ``Siccodes{ind_num}.txt`` member.

    Args:
        ind_num: Number used to build the archive and member file names.

    Returns:
        The contents of the text file inside the archive, as a ``str``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within 60 seconds.
    """
    zip_url = ('http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/Siccodes{}.zip'
               .format(ind_num))
    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(zip_url, timeout=60)
    # Fail with the HTTP error itself rather than letting an HTML error page
    # reach ZipFile and surface as a confusing BadZipFile.
    response.raise_for_status()
    with ZipFile(BytesIO(response.content)) as archive:
        with archive.open('Siccodes{}.txt'.format(ind_num)) as member:
            return member.read().decode()
def get_ffind_df(ind_num):
    """Build a lookup table from SIC code to Fama-French industry.

    Downloads the ``Siccodes{ind_num}`` definition text with
    ``download_ffind_zip`` and expands its SIC ranges into one row for every
    SIC code from 0 through 9999.

    Args:
        ind_num: Which classification to load; one of 5, 10, 12, 17, 30, 38,
            48 or 49.

    Returns:
        A pandas DataFrame with the columns ``sic``, ``ff{ind_num}`` (int),
        ``ff{ind_num}_name`` and ``detail``. SIC codes not covered by any
        range in the file are assigned to the industry named "Other".

    Raises:
        ValueError: If ``ind_num`` is not one of the supported numbers.
    """
    valid_ind_nums = [5, 10, 12, 17, 30, 38, 48, 49]
    if ind_num not in valid_ind_nums:
        raise ValueError('Industry number must be one of {} not {}.'
                         .format(valid_ind_nums, ind_num))

    num_key = 'ff{0}'.format(ind_num)
    name_key = 'ff{0}_name'.format(ind_num)

    # Industry header line: number, short alphabetic name, free-text detail.
    re_nameline = re.compile(
        r'^\s*(?P<ff{0}>\d\d?)\s+(?P<ff{0}_name>[a-z]+)\s+(?P<detail>.+)\s*$'
        .format(ind_num), re.I)
    # SIC range line: "from-to" with an optional trailing note.
    re_rangeline = re.compile(
        r'^\s*(?P<sicfrom>\d{3,4})-(?P<sicto>\d{3,4})(?P<notes>\s+.+)?\s*$', re.I)

    # Both passes below match against stripped single lines, so a pattern can
    # never span a line break and captured text never carries a trailing '\r'.
    lines = [line.strip() for line in download_ffind_zip(ind_num).splitlines()]

    # Every SIC code starts out as 'Other': use the file's own "Other" header
    # when it has one, otherwise fall back to a synthetic entry.
    other_ind = {num_key: ind_num, name_key: 'Other', 'detail': ''}
    for line in lines:
        match = re_nameline.search(line)
        if match and match.group(name_key).lower() == 'other':
            other_ind = match.groupdict()
            break

    vals = {sic: other_ind for sic in range(10000)}

    # Each range line belongs to the most recently seen industry header.
    current_ind = other_ind
    for line in lines:
        match = re_nameline.search(line)
        if match:
            current_ind = match.groupdict()
            continue
        match = re_rangeline.search(line)
        if not match:
            continue
        sicfrom, sicto = int(match.group('sicfrom')), int(match.group('sicto'))
        for sic in range(sicfrom, sicto + 1):
            vals[sic] = current_ind

    df = pd.DataFrame.from_dict(vals, orient='index')
    df.index.name = 'sic'
    # Header matches capture the industry number as text; store it as int.
    df[num_key] = df[num_key].astype(int)
    return df.reset_index()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment