# djmm187/nasq-earnings-cal-scraper.py

## nasq-earnings-cal-scraper.py
import argparse
import json
import pendulum
import re
import requests

from bs4 import BeautifulSoup


# Endpoint serving the NASDAQ earnings calendar page that get_page() requests.
BASE_URL = 'http://www.nasdaq.com/earnings/earnings-calendar.aspx'
# Query-string parameters that get_page() merges with the requested date.
# `date` has no default value here (None).
BASE_PARAMS = {
    'date': None
}


class FormatUtils:
    """Common formatter util for dates, numbers, and strings.

    All helpers are static so they can be called on the class itself or
    through a subclass.

    """

    # Multipliers for the magnitude suffixes accepted by
    # ``expand_currency``; looked up case-insensitively.
    _MAGNITUDES = {
        'k': 1000,
        'm': 1000000,
        'b': 1000000000,
        't': 1000000000000,
    }

    @staticmethod
    def iso_meta(date, key):
        """Convert a date string to ISO format and wrap it in a dict.

        Parsing is best effort: when ``date`` cannot be parsed (or is not
        a string) a message is printed and the value is returned
        unchanged under ``key``.

        :param str date:
            date string in a format ``pendulum.parse`` understands
        :param str key:
            key for the returning dict

        :returns:
            dict ``{key: <iso formatted date>}``, or ``{key: date}`` with
            the untouched input when parsing fails

        """

        try:
            date = pendulum.parse(date.strip()).isoformat()
        except Exception:
            # Broad on purpose: a non-string input (AttributeError on
            # ``.strip()``) and any parser failure must both fall through
            # to returning the raw value instead of aborting the scrape.
            print('Date is not in a parsable format')

        return {key: date}

    @staticmethod
    def currency_meta(cur, key):
        """Convert a currency string to a float and wrap it in a dict.

        The dollar sign, thousands separators and surrounding whitespace
        are ignored.

        :param str cur:
            currency string, i.e. '$1.0', '$-9.23', '$1,234.50'
        :param str key:
            key for the returning dict

        :returns:
            dict ``{key: <float>}``; the value is ``None`` (and a message
            is printed) when ``cur`` is not a parsable currency string

        """

        num = None

        try:
            num = float(cur.strip().replace('$', '').replace(',', ''))
        except (AttributeError, ValueError):
            # AttributeError covers ``None`` / non-string input.
            print('Currency is not in correct format.')

        return {key: num}

    @staticmethod
    def expand_currency(cur):
        """Expand an abbreviated currency string to a float.

        :param str cur:
            currency string, i.e. '1M', '12B', '12k', '1.5T'

        :returns:
            ``None`` when ``cur`` is empty; otherwise the expanded value,
            or 0.0 when the number is invalid or the magnitude suffix is
            not recognised (e.g. 'n/a' or a bare number)

        """

        if not cur:
            return None

        text = str(cur).strip().replace(',', '')

        if not text:
            return 0.0

        multiplier = FormatUtils._MAGNITUDES.get(text[-1].lower())

        if multiplier is None:
            return 0.0

        try:
            return float(text[:-1]) * multiplier
        except ValueError:
            # Honour the documented contract: invalid input yields 0
            # rather than an exception.
            return 0.0


class NASDAQEarnings(FormatUtils):
    """Basic mappings and extractions for NASDAQ Earnings Schedule.

    Each classmethod takes one table cell of a calendar row and returns a
    dict of the fields it extracted. ``markup_map`` ties a column index
    to the classmethod that parses that column.

    For raw markup structure, see:
        http://www.nasdaq.com/earnings/earnings-calendar.aspx?date=2016-Aug-04

    """

    # Matches text such as "ABC Corp. (ABCD) Market Cap: $12.3B".
    COMPANY_RE = re.compile(r'(?P<name>.*?) \((?P<ticker>.*?)\) Market Cap: \$(?P<market_cap>.*?)$')

    # Column index (as a string) -> name of the classmethod parsing it.
    markup_map = {
        '0': 'release_time',
        '1': 'company_info',
        '2': 'expected_report_date',
        '3': 'fiscal_quarter_ending',
        '4': 'avg_eps_forecast',
        '5': 'num_ests',
        '6': 'last_year_reporting_date',
        '7': 'last_years_eps'
    }

    @classmethod
    def release_time(cls, markup):
        """Parses out when earnings will be released based on the link
        found in the cell.

        :param markup:
            html table cell containing information

        :returns:
            dict with release time information. ex.
            {"release_time": 'pre'}

            Allowed options are:
                - pre
                - post
                - unknown

        """

        release_time = 'unknown'

        anchor = markup.find('a')

        if anchor is not None:
            # An anchor without ``href`` yields None; fall back to '' so
            # the substring checks below cannot raise.
            symbol_link = anchor.get('href') or ''

            if 'premarket' in symbol_link:
                release_time = 'pre'

            if 'after-hours' in symbol_link:
                release_time = 'post'

        return {'release_time': release_time}

    @classmethod
    def company_info(cls, markup):
        """Parses out company name, ticker, and market cap at the time of
        the query.

        :param markup:
            html table cell containing information

        :returns:
            dict with name, ticker, market_cap, ex
            {
                "name": "ABC Corp.",
                "ticker": "ABCD",
                "market_cap": 123123123.1
            }

            All three values are ``None`` when the cell has no link or
            its text does not match ``COMPANY_RE``.

        """

        missing = {'name': None, 'ticker': None, 'market_cap': None}

        anchor = markup.find('a')

        if anchor is None:
            return missing

        # Collapse runs of whitespace/newlines so the single-space
        # pattern in COMPANY_RE is not defeated by markup formatting.
        text = ' '.join(anchor.text.split())
        match = cls.COMPANY_RE.search(text)

        if match is None:
            return missing

        company_meta = match.groupdict()

        try:
            expanded = cls.expand_currency(company_meta['market_cap'])
        except ValueError:
            # An unparsable cap (e.g. 'n/a') must not abort the row.
            expanded = 0.0

        company_meta['market_cap'] = expanded

        return company_meta

    @classmethod
    def expected_report_date(cls, markup):
        """Parses out the expected report date.

        :param markup:
            html table cell containing information

        :returns:
            dict with expected earnings date, ex
            {"expected_report_date": '2016-07-12T15:22:29.005626-07:00'}

        """

        return cls.iso_meta(
            markup.text,
            'expected_report_date'
        )

    @classmethod
    def fiscal_quarter_ending(cls, markup):
        """Parses out the quarter end date.

        :param markup:
            html table cell containing information

        :returns:
            dict with the quarter end date, ex
            {"fiscal_quarter_ending": '2016-07-12T15:22:29.005626-07:00'}

        """

        return cls.iso_meta(
            markup.text,
            'fiscal_quarter_ending'
        )

    @classmethod
    def avg_eps_forecast(cls, markup):
        """Parses out the avg EPS forecast.

        :param markup:
            html table cell containing information

        :returns:
            dict with avg EPS forecast information, ex
            {"avg_eps_forecast": 2.1}

        """

        return cls.currency_meta(
            markup.text,
            'avg_eps_forecast'
        )

    @classmethod
    def num_ests(cls, markup):
        """Returns the total number of estimates listed in the cell.

        :param markup:
            html table cell containing information

        :returns:
            dict with estimate information, ex
            {"num_ests": 2}; the value is ``None`` when the cell does
            not hold an integer

        """

        try:
            count = int(markup.text)
        except ValueError:
            # Non-numeric placeholder cell; keep the row instead of
            # crashing, mirroring currency_meta's best-effort style.
            count = None

        return {'num_ests': count}

    @classmethod
    def last_year_reporting_date(cls, markup):
        """Parses out the last years reporting date.

        :param markup:
            html table cell containing information

        :returns:
            dict with last years reporting date, ex
            {"last_year_reporting_date": '2016-07-12T15:22:29.005626-07:00'}

        """

        return cls.iso_meta(
            markup.text,
            'last_year_reporting_date'
        )

    @classmethod
    def last_years_eps(cls, markup):
        """Parses out the last years EPS.

        :param markup:
            html table cell containing information

        :returns:
            dict with last years EPS, ex
            {"last_years_eps": 2.12}

        """

        return cls.currency_meta(
            markup.text,
            'last_years_eps'
        )


def format_date(day):
    """Take a date and format it like `2017-Jul-28`.

    :param day:
        Target date for companies with pre and post market earnings
        releases. Either a date string that ``pendulum.parse`` accepts
        or a ``date`` / ``datetime`` object.

    :returns:
        str with the date formatted as `%Y-%b-%d`

    :raises ValueError:
        propagated from the parser when a string cannot be parsed

    """

    from datetime import date

    # ``datetime`` subclasses ``date``, so this covers both object kinds
    # without sending them through the string parser.
    if isinstance(day, date):
        parsed = day
    else:
        parsed = pendulum.parse(day)

    # strftime is the stdlib datetime API, so the result does not depend
    # on which format mini-language the installed pendulum uses.
    return parsed.strftime('%Y-%b-%d')


def get_page(day, timeout=10):
    """Get the raw HTML of the page containing earnings

    :param str day:
        formatted date string for target date
    :param timeout:
        seconds to wait for the server before ``requests`` gives up

    :returns:
        str format of raw html, or ``None`` when the response status is
        not OK

    """

    # Defaults first, explicit date last: the later key wins, so the
    # ``date: None`` placeholder in BASE_PARAMS can no longer overwrite
    # the requested day.
    params = {**BASE_PARAMS, 'date': day}

    resp = requests.get(BASE_URL, params=params, timeout=timeout)

    return resp.text if resp.ok else None


def get_companies(page):
    """Take the raw html page as a string, parse it using BeautifulSoup,
    and extract the earnings schedule for each company.

    :param str page:
        raw html page; ``None`` or an empty string is tolerated

    :returns:
        list of basic company dicts containing earnings schedule and
        basic est data; empty when there is no page or the page has no
        earnings table

    """

    # get_page() returns None on a failed request; treat that as "no data".
    if not page:
        return []

    soup = BeautifulSoup(page, 'html.parser')
    tbl = soup.find('table', class_='USMN_EarningsCalendar')

    if tbl is None:
        return []

    # The first row is skipped (treated as the header row).
    return [extract_meta(row) for row in tbl.find_all('tr')[1:]]


def extract_meta(row):
    """Grabs the page mapping and extracts all posted company earnings
    information based on the map.

    :param row:
        table row containing information on a specific company

    :returns:
        dict containing extracted information; columns missing from the
        row, or whose parser returns nothing, are left out

        sample output:
            {
                "release_time": "pre",
                "name": "Zimmer Biomet Holdings, Inc.",
                "ticker": "ZBH",
                "market_cap": 25950000000.0,
                "expected_report_date": "2017-07-27T00:00:00+00:00",
                "fiscal_quarter_ending": "2017-06-27T00:00:00+00:00",
                "avg_eps_forecast": 2.1,
                "num_ests": 15,
                "last_year_reporting_date": "2016-07-28T00:00:00+00:00",
                "last_years_eps": 2.02
            }

    """

    company = {}

    cells = row.find_all('td')

    for column, parser_name in NASDAQEarnings.markup_map.items():
        idx = int(column)

        # A row with fewer cells than mapped columns would otherwise
        # raise IndexError.
        if idx >= len(cells):
            continue

        info = getattr(NASDAQEarnings, parser_name)(cells[idx])

        # A parser may return None when its cell does not match.
        if info:
            company.update(info)

    return company


# Command-line entry point: fetch the calendar for the requested date and
# print the extracted companies as JSON.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Earnings Dates')
    # Required: without a date format_date() would fail on None with a
    # traceback instead of a usage message.
    parser.add_argument('-d', '--date', dest='earnings_date',
                        required=True,
                        help='Enter a date to return companies '
                             'with earnings that day.')

    args = parser.parse_args()
    day = format_date(args.earnings_date)
    page = get_page(day)

    # get_page() returns None when the response status is not OK.
    if page is None:
        raise SystemExit(
            'Could not retrieve the earnings calendar for ' + day
        )

    print(json.dumps(get_companies(page), indent=2))
	import argparse
	import json
	import pendulum
	import re
	import requests

	from bs4 import BeautifulSoup


	# Endpoint serving the NASDAQ earnings calendar page that get_page() requests.
	BASE_URL = 'http://www.nasdaq.com/earnings/earnings-calendar.aspx'
	# Query-string parameters that get_page() merges with the requested date.
	# `date` has no default value here (None).
	BASE_PARAMS = {
	'date': None
	}


	class FormatUtils:
	    """Common formatter util for dates, numbers, and strings.

	    All helpers are static so they can be called on the class itself or
	    through a subclass.

	    """

	    # Multipliers for the magnitude suffixes accepted by
	    # ``expand_currency``; looked up case-insensitively.
	    _MAGNITUDES = {
	        'k': 1000,
	        'm': 1000000,
	        'b': 1000000000,
	        't': 1000000000000,
	    }

	    @staticmethod
	    def iso_meta(date, key):
	        """Convert a date string to ISO format and wrap it in a dict.

	        Parsing is best effort: when ``date`` cannot be parsed (or is not
	        a string) a message is printed and the value is returned
	        unchanged under ``key``.

	        :param str date:
	            date string in a format ``pendulum.parse`` understands
	        :param str key:
	            key for the returning dict

	        :returns:
	            dict ``{key: <iso formatted date>}``, or ``{key: date}`` with
	            the untouched input when parsing fails

	        """

	        try:
	            date = pendulum.parse(date.strip()).isoformat()
	        except Exception:
	            # Broad on purpose: a non-string input (AttributeError on
	            # ``.strip()``) and any parser failure must both fall through
	            # to returning the raw value instead of aborting the scrape.
	            print('Date is not in a parsable format')

	        return {key: date}

	    @staticmethod
	    def currency_meta(cur, key):
	        """Convert a currency string to a float and wrap it in a dict.

	        The dollar sign, thousands separators and surrounding whitespace
	        are ignored.

	        :param str cur:
	            currency string, i.e. '$1.0', '$-9.23', '$1,234.50'
	        :param str key:
	            key for the returning dict

	        :returns:
	            dict ``{key: <float>}``; the value is ``None`` (and a message
	            is printed) when ``cur`` is not a parsable currency string

	        """

	        num = None

	        try:
	            num = float(cur.strip().replace('$', '').replace(',', ''))
	        except (AttributeError, ValueError):
	            # AttributeError covers ``None`` / non-string input.
	            print('Currency is not in correct format.')

	        return {key: num}

	    @staticmethod
	    def expand_currency(cur):
	        """Expand an abbreviated currency string to a float.

	        :param str cur:
	            currency string, i.e. '1M', '12B', '12k', '1.5T'

	        :returns:
	            ``None`` when ``cur`` is empty; otherwise the expanded value,
	            or 0.0 when the number is invalid or the magnitude suffix is
	            not recognised (e.g. 'n/a' or a bare number)

	        """

	        if not cur:
	            return None

	        text = str(cur).strip().replace(',', '')

	        if not text:
	            return 0.0

	        multiplier = FormatUtils._MAGNITUDES.get(text[-1].lower())

	        if multiplier is None:
	            return 0.0

	        try:
	            return float(text[:-1]) * multiplier
	        except ValueError:
	            # Honour the documented contract: invalid input yields 0
	            # rather than an exception.
	            return 0.0


	class NASDAQEarnings(FormatUtils):
	    """Basic mappings and extractions for NASDAQ Earnings Schedule.

	    Each classmethod takes one table cell of a calendar row and returns a
	    dict of the fields it extracted. ``markup_map`` ties a column index
	    to the classmethod that parses that column.

	    For raw markup structure, see:
	        http://www.nasdaq.com/earnings/earnings-calendar.aspx?date=2016-Aug-04

	    """

	    # Matches text such as "ABC Corp. (ABCD) Market Cap: $12.3B".
	    # The lazy ``.*?`` quantifiers are required: a bare ``.?`` would only
	    # ever capture a single character for name and ticker.
	    COMPANY_RE = re.compile(r'(?P<name>.*?) \((?P<ticker>.*?)\) Market Cap: \$(?P<market_cap>.*?)$')

	    # Column index (as a string) -> name of the classmethod parsing it.
	    markup_map = {
	        '0': 'release_time',
	        '1': 'company_info',
	        '2': 'expected_report_date',
	        '3': 'fiscal_quarter_ending',
	        '4': 'avg_eps_forecast',
	        '5': 'num_ests',
	        '6': 'last_year_reporting_date',
	        '7': 'last_years_eps'
	    }

	    @classmethod
	    def release_time(cls, markup):
	        """Parses out when earnings will be released based on the link
	        found in the cell.

	        :param markup:
	            html table cell containing information

	        :returns:
	            dict with release time information. ex.
	            {"release_time": 'pre'}

	            Allowed options are:
	                - pre
	                - post
	                - unknown

	        """

	        release_time = 'unknown'

	        anchor = markup.find('a')

	        if anchor is not None:
	            # An anchor without ``href`` yields None; fall back to '' so
	            # the substring checks below cannot raise.
	            symbol_link = anchor.get('href') or ''

	            if 'premarket' in symbol_link:
	                release_time = 'pre'

	            if 'after-hours' in symbol_link:
	                release_time = 'post'

	        return {'release_time': release_time}

	    @classmethod
	    def company_info(cls, markup):
	        """Parses out company name, ticker, and market cap at the time of
	        the query.

	        :param markup:
	            html table cell containing information

	        :returns:
	            dict with name, ticker, market_cap, ex
	            {
	                "name": "ABC Corp.",
	                "ticker": "ABCD",
	                "market_cap": 123123123.1
	            }

	            All three values are ``None`` when the cell has no link or
	            its text does not match ``COMPANY_RE``.

	        """

	        missing = {'name': None, 'ticker': None, 'market_cap': None}

	        anchor = markup.find('a')

	        if anchor is None:
	            return missing

	        # Collapse runs of whitespace/newlines so the single-space
	        # pattern in COMPANY_RE is not defeated by markup formatting.
	        text = ' '.join(anchor.text.split())
	        match = cls.COMPANY_RE.search(text)

	        if match is None:
	            return missing

	        company_meta = match.groupdict()

	        try:
	            expanded = cls.expand_currency(company_meta['market_cap'])
	        except ValueError:
	            # An unparsable cap (e.g. 'n/a') must not abort the row.
	            expanded = 0.0

	        company_meta['market_cap'] = expanded

	        return company_meta

	    @classmethod
	    def expected_report_date(cls, markup):
	        """Parses out the expected report date.

	        :param markup:
	            html table cell containing information

	        :returns:
	            dict with expected earnings date, ex
	            {"expected_report_date": '2016-07-12T15:22:29.005626-07:00'}

	        """

	        return cls.iso_meta(
	            markup.text,
	            'expected_report_date'
	        )

	    @classmethod
	    def fiscal_quarter_ending(cls, markup):
	        """Parses out the quarter end date.

	        :param markup:
	            html table cell containing information

	        :returns:
	            dict with the quarter end date, ex
	            {"fiscal_quarter_ending": '2016-07-12T15:22:29.005626-07:00'}

	        """

	        return cls.iso_meta(
	            markup.text,
	            'fiscal_quarter_ending'
	        )

	    @classmethod
	    def avg_eps_forecast(cls, markup):
	        """Parses out the avg EPS forecast.

	        :param markup:
	            html table cell containing information

	        :returns:
	            dict with avg EPS forecast information, ex
	            {"avg_eps_forecast": 2.1}

	        """

	        return cls.currency_meta(
	            markup.text,
	            'avg_eps_forecast'
	        )

	    @classmethod
	    def num_ests(cls, markup):
	        """Returns the total number of estimates listed in the cell.

	        :param markup:
	            html table cell containing information

	        :returns:
	            dict with estimate information, ex
	            {"num_ests": 2}; the value is ``None`` when the cell does
	            not hold an integer

	        """

	        try:
	            count = int(markup.text)
	        except ValueError:
	            # Non-numeric placeholder cell; keep the row instead of
	            # crashing, mirroring currency_meta's best-effort style.
	            count = None

	        return {'num_ests': count}

	    @classmethod
	    def last_year_reporting_date(cls, markup):
	        """Parses out the last years reporting date.

	        :param markup:
	            html table cell containing information

	        :returns:
	            dict with last years reporting date, ex
	            {"last_year_reporting_date": '2016-07-12T15:22:29.005626-07:00'}

	        """

	        return cls.iso_meta(
	            markup.text,
	            'last_year_reporting_date'
	        )

	    @classmethod
	    def last_years_eps(cls, markup):
	        """Parses out the last years EPS.

	        :param markup:
	            html table cell containing information

	        :returns:
	            dict with last years EPS, ex
	            {"last_years_eps": 2.12}

	        """

	        return cls.currency_meta(
	            markup.text,
	            'last_years_eps'
	        )


	def format_date(day):
	    """Take a date and format it like `2017-Jul-28`.

	    :param day:
	        Target date for companies with pre and post market earnings
	        releases. Either a date string that ``pendulum.parse`` accepts
	        or a ``date`` / ``datetime`` object.

	    :returns:
	        str with the date formatted as `%Y-%b-%d`

	    :raises ValueError:
	        propagated from the parser when a string cannot be parsed

	    """

	    from datetime import date

	    # ``datetime`` subclasses ``date``, so this covers both object kinds
	    # without sending them through the string parser.
	    if isinstance(day, date):
	        parsed = day
	    else:
	        parsed = pendulum.parse(day)

	    # strftime is the stdlib datetime API, so the result does not depend
	    # on which format mini-language the installed pendulum uses.
	    return parsed.strftime('%Y-%b-%d')


	def get_page(day, timeout=10):
	    """Get the raw HTML of the page containing earnings

	    :param str day:
	        formatted date string for target date
	    :param timeout:
	        seconds to wait for the server before ``requests`` gives up

	    :returns:
	        str format of raw html, or ``None`` when the response status is
	        not OK

	    """

	    # Defaults first, explicit date last: the later key wins, so the
	    # ``date: None`` placeholder in BASE_PARAMS can no longer overwrite
	    # the requested day.
	    params = {**BASE_PARAMS, 'date': day}

	    resp = requests.get(BASE_URL, params=params, timeout=timeout)

	    return resp.text if resp.ok else None


	def get_companies(page):
	    """Take the raw html page as a string, parse it using BeautifulSoup,
	    and extract the earnings schedule for each company.

	    :param str page:
	        raw html page; ``None`` or an empty string is tolerated

	    :returns:
	        list of basic company dicts containing earnings schedule and
	        basic est data; empty when there is no page or the page has no
	        earnings table

	    """

	    # get_page() returns None on a failed request; treat that as "no data".
	    if not page:
	        return []

	    soup = BeautifulSoup(page, 'html.parser')
	    tbl = soup.find('table', class_='USMN_EarningsCalendar')

	    if tbl is None:
	        return []

	    # The first row is skipped (treated as the header row).
	    return [extract_meta(row) for row in tbl.find_all('tr')[1:]]


	def extract_meta(row):
	    """Grabs the page mapping and extracts all posted company earnings
	    information based on the map.

	    :param row:
	        table row containing information on a specific company

	    :returns:
	        dict containing extracted information; columns missing from the
	        row, or whose parser returns nothing, are left out

	        sample output:
	            {
	                "release_time": "pre",
	                "name": "Zimmer Biomet Holdings, Inc.",
	                "ticker": "ZBH",
	                "market_cap": 25950000000.0,
	                "expected_report_date": "2017-07-27T00:00:00+00:00",
	                "fiscal_quarter_ending": "2017-06-27T00:00:00+00:00",
	                "avg_eps_forecast": 2.1,
	                "num_ests": 15,
	                "last_year_reporting_date": "2016-07-28T00:00:00+00:00",
	                "last_years_eps": 2.02
	            }

	    """

	    company = {}

	    cells = row.find_all('td')

	    for column, parser_name in NASDAQEarnings.markup_map.items():
	        idx = int(column)

	        # A row with fewer cells than mapped columns would otherwise
	        # raise IndexError.
	        if idx >= len(cells):
	            continue

	        info = getattr(NASDAQEarnings, parser_name)(cells[idx])

	        # A parser may return None when its cell does not match.
	        if info:
	            company.update(info)

	    return company


	# Command-line entry point: fetch the calendar for the requested date and
	# print the extracted companies as JSON.
	if __name__ == '__main__':
	    parser = argparse.ArgumentParser(description='Earnings Dates')
	    # Required: without a date format_date() would fail on None with a
	    # traceback instead of a usage message.
	    parser.add_argument('-d', '--date', dest='earnings_date',
	                        required=True,
	                        help='Enter a date to return companies '
	                             'with earnings that day.')

	    args = parser.parse_args()
	    day = format_date(args.earnings_date)
	    page = get_page(day)

	    # get_page() returns None when the response status is not OK.
	    if page is None:
	        raise SystemExit(
	            'Could not retrieve the earnings calendar for ' + day
	        )

	    print(json.dumps(get_companies(page), indent=2))