Skip to content

Instantly share code, notes, and snippets.

Created October 2, 2011 12:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/1257408 to your computer and use it in GitHub Desktop.
Save anonymous/1257408 to your computer and use it in GitHub Desktop.
pydsnap
# Example usage of DsebdIntradayScraper.
# `raw_html` is not defined in this snippet; the caller must supply the page
# HTML as a string before running it.
filter_nontrading_stocks = True
# The scraper takes no constructor arguments; the HTML and the filter flag
# are both passed to execute(), HTML first.
scraper = DsebdIntradayScraper()
quotes = scraper.execute(raw_html, filter_nontrading_stocks)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# dse_intraday.py
#
# This file is part of the MaxTrader Project. http://www.maxtraderbd.com/
#
# Copyright (c) 2011 invarBrass <NO_SPAM@maxtraderbd.com>
# Portions Copyright (c) 2010, M. Nasimul Haque
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the MaxTrader Software Services nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# $Author: NO_SPAM@maxtraderbd.com $
# $Id: dse_intraday.py 22 2011-10-01 15:33:48Z NO_SPAM@maxtraderbd.com $
# $Rev: 22 $
import re, string
from BeautifulSoup import BeautifulSoup
from pipeline import *
# Integer identifiers for the quote fields. They are used as dictionary keys
# for both the discovered column positions and the scraped quote values, and
# the code iterates over them with range(TICKER, VOLUME + 1), so they must
# stay contiguous.
TICKER = 1
LTP = 2
HIGH = 3
LOW = 4
CLOSE = 5
YCP = 6
CHANGE = 7
TRADE = 8
VOLUME = 9
class DseScraperContext(PipelineContext):
    """Shared state passed between the scraping stages.

    Holds the raw and sanitized HTML, the parsed document, the table rows,
    the discovered column positions and the scraped/validated quotes.
    """

    def __init__(self, raw_data):
        """Create a context for one scraping run.

        Args:
            raw_data: the HTML text to scrape.
        """
        super(DseScraperContext, self).__init__()
        self.raw_data = raw_data
        # Until a sanitization step runs, the sanitized view is the raw input.
        self.sanitized_data = raw_data
        self.filter_nontraded_stocks = True
        # Create the per-run attributes immediately so that they exist even
        # if no stage has called reset() yet.
        self.reset()

    def reset(self):
        """Clear all per-run results and forget discovered column positions."""
        self.nontraded_stocks = 0
        self.raw_quotes = []
        self.validated_quotes = []
        self.table_rows = []
        self.soup = None
        # Every field starts at -1 until a later stage assigns its position.
        self.column_indices = dict.fromkeys(range(TICKER, VOLUME + 1), -1)
class HtmlSanitizationStage(Stage):
    """Simplifies the raw HTML before it is parsed.

    Attributes are stripped from selected tags, anchor targets are replaced
    with "#", <link> tags become a comment and line breaks become spaces.
    """

    # Maps each replacement string to the pattern whose matches it replaces.
    _tidy_patterns = {
        '<body>': re.compile(r'<body[^>]*>', re.I),
        '<tr>': re.compile(r'<tr[^>]*>', re.I),
        '<td>': re.compile(r'<td[^>]*>', re.I),
        '<a href="#">': re.compile(r'<a\s+href=[^>]*>', re.I),
        '<font>': re.compile(r'<font[^>]*>', re.I),
        '<!-- # -->': re.compile(r'<link[^>]*>', re.I),
        ' ': re.compile(r'[\r\n]')
    }

    def execute(self, context):
        """Store a tidied copy of context.raw_data in context.sanitized_data."""
        cleaned = context.raw_data
        for replacement in self._tidy_patterns:
            pattern = self._tidy_patterns[replacement]
            cleaned = pattern.sub(replacement, cleaned)
        context.sanitized_data = cleaned
class SoupifyStage(Stage):
    """Parses the sanitized HTML and collects the rows of the quotes table."""

    def execute(self, context):
        """Reset the context, parse its sanitized HTML and store the <tr> rows.

        The rows are taken from the table reached through
        soup.body.table and stored in context.table_rows.
        """
        context.reset()
        document = BeautifulSoup(context.sanitized_data)
        context.soup = document
        quotes_table = document.body.table
        context.table_rows = quotes_table.findAll('tr')
class FieldsDiscoveryStage(Stage):
    """Works out which column holds each quote field from the header row."""

    # Case-insensitive whole-word patterns matched against the header labels.
    _column_patterns = {
        TICKER: re.compile(r'\bcode\b', re.I),
        LTP: re.compile(r'\bltp\b', re.I),
        HIGH: re.compile(r'\bhigh\b', re.I),
        LOW: re.compile(r'\blow\b', re.I),
        CLOSE: re.compile(r'\bclose\b', re.I),
        YCP: re.compile(r'\bycp\b', re.I),
        CHANGE: re.compile(r'\bchange\b', re.I),
        TRADE: re.compile(r'\btrade\b', re.I),
        VOLUME: re.compile(r'\bvolume\b', re.I)
    }

    def execute(self, context):
        """Fill context.column_indices from the <b> labels of the first row.

        Each label is tested against the field patterns in field order and
        the first matching field receives the label's position plus one.
        Labels without any text are skipped but still count towards the
        position of the labels that follow them.
        """
        header_row = context.table_rows[0]
        labels = header_row.findAll('b')
        for col_ix, label in enumerate(labels):
            text = label.find(text=True)
            if text is None:
                # An empty <b></b> has no text to match; searching None
                # would raise TypeError.
                continue
            for field in range(TICKER, VOLUME + 1):
                if self._column_patterns[field].search(text) is not None:
                    # NOTE(review): the +1 offset presumably accounts for a
                    # leading column without a bold label - confirm against
                    # the page markup.
                    context.column_indices[field] = col_ix + 1
                    break
class QuotesScrapingStage(Stage):
    """Extracts the raw text of every quote field from the data rows."""

    def _read_field(self, field, cell):
        """Return the stripped text of one cell.

        The ticker is read from the first child of the cell's anchor; every
        other field uses the first text node of the cell.
        """
        if field == TICKER:
            return cell.a.contents[0].strip()
        return str(cell.find(text=True)).strip()

    def execute(self, context):
        """Append one {field: text} dict per data row to context.raw_quotes.

        The first table row is skipped; each remaining row's cells are
        looked up at the positions stored in context.column_indices.
        """
        for data_row in context.table_rows[1:]:
            columns = data_row.findAll('td')
            scraped = {}
            for field, position in context.column_indices.items():
                scraped[field] = self._read_field(field, columns[position])
            context.raw_quotes.append(scraped)
class QuotesValidationStage(Stage):
    """Converts scraped quote text to typed values and filters the quotes."""

    def _sanitize(self, quote):
        """Return a typed copy of *quote*.

        The ticker is upper-cased, the fields from LTP to CHANGE become
        floats and the remaining fields become ints. Every numeric field
        except CHANGE is replaced by 0 when it is not >= 0.
        """
        cleaned = {}
        for field in range(TICKER, VOLUME + 1):
            raw = quote[field]
            if field == TICKER:
                cleaned[field] = raw.upper()
                continue
            if LTP <= field <= CHANGE:
                number = float(raw)
                keep_sign = (field == CHANGE)
            else:
                number = int(raw)
                keep_sign = False
            if not keep_sign:
                # Written as ">= 0 else 0" so that any value failing the
                # comparison is replaced, exactly as a plain clamp would not.
                number = number if number >= 0 else 0
            cleaned[field] = number
        return cleaned

    def execute(self, context):
        """Store the sanitized quotes in context.validated_quotes.

        When context.filter_nontraded_stocks is set, quotes whose TRADE value
        is 0 are left out and counted in context.nontraded_stocks.
        """
        accepted = []
        for raw_quote in context.raw_quotes:
            cleaned = self._sanitize(raw_quote)
            if cleaned[TRADE] == 0 and context.filter_nontraded_stocks:
                context.nontraded_stocks += 1
                continue
            accepted.append(cleaned)
        context.validated_quotes = accepted
class DsebdIntradayScraper(object):
    """Runs the sanitize / parse / discover / scrape / validate stages in order."""

    __pipeline = None

    def __init__(self):
        """Assemble the sequential pipeline of scraping stages."""
        stage_types = (
            HtmlSanitizationStage,
            SoupifyStage,
            FieldsDiscoveryStage,
            QuotesScrapingStage,
            QuotesValidationStage,
        )
        pipeline = SequentialPipeline()
        for stage_type in stage_types:
            pipeline.add_stage(stage_type())
        self.__pipeline = pipeline

    def execute(self, html, filter_nontraded_stocks = False):
        """Scrape *html* and return the list of validated quotes.

        Args:
            html: the page HTML to scrape.
            filter_nontraded_stocks: copied onto the context, where the
                validation stage uses it to drop quotes with no trades.

        Raises:
            Exception: carrying the joined error messages when the context
                reports errors after the pipeline has run.
        """
        context = DseScraperContext(html)
        context.filter_nontraded_stocks = filter_nontraded_stocks
        self.__pipeline.execute(context)
        if not context.has_errors():
            return context.validated_quotes
        raise Exception(context.error_messages())
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# pipeline.py
#
# This file is part of the MaxTrader Project. http://www.maxtraderbd.com/
#
# Copyright (c) 2011 invarBrass <NO_SPAM@maxtraderbd.com>
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the MaxTrader Software Services nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# $Author: NO_SPAM@maxtraderbd.com $
# $Id: pipeline.py 22 2011-10-01 15:33:48Z NO_SPAM@maxtraderbd.com $
# $Rev: 22 $
class PipelineContext(object):
    """Base context that collects the errors reported while a pipeline runs."""

    # Placeholder only. The real list is created per instance on first use,
    # so contexts never share recorded errors, even when a subclass does not
    # call a base initialiser.
    _errors = None

    def get_errors(self):
        """Return this context's list of recorded errors (possibly empty)."""
        if self._errors is None:
            self._errors = []
        return self._errors

    def has_errors(self):
        """Return True when at least one error has been recorded."""
        return bool(self._errors)

    def add_error(self, err):
        """Record *err* on this context."""
        self.get_errors().append(err)

    def error_messages(self):
        """Return all recorded errors joined with CRLF line breaks."""
        return '\r\n'.join(self.get_errors())
class Stage(object):
    """Base class for one processing step of a pipeline."""

    def execute(self, context):
        """Process *context*; concrete stages must override this method.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError(
            '%s must implement execute()' % type(self).__name__)
class Pipeline(Stage):
    """Base class for a stage that is itself composed of other stages."""

    def add_stage(self, stage):
        """Register *stage* with the pipeline; concrete pipelines must override.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError(
            '%s must implement add_stage()' % type(self).__name__)
class SequentialPipeline(Pipeline):
    """Pipeline that runs its stages one after another in insertion order."""

    # Placeholder only. The list is created per instance on the first
    # add_stage() call, so separate pipelines never share (and re-run) each
    # other's stages.
    _stages = None

    def add_stage(self, stage):
        """Append *stage* to the end of this pipeline."""
        if self._stages is None:
            self._stages = []
        self._stages.append(stage)

    def execute(self, context):
        """Call execute(context) on every registered stage, in order."""
        for stage in self._stages or ():
            stage.execute(context)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment