# jinhwanlazy/poloniex_crawler.py

## poloniex_crawler.py
import httplib2
import pandas as pd
import simplejson as json
from datetime import datetime
from datetime import timedelta


def get_trade_history_(currency_pair='USDT_BTC', start=1494000000, end=1600000000):
    """
    Query Poloniex's public ``returnTradeHistory`` endpoint once.

    Parameters
    ----------
    currency_pair : str
        Market identifier, placed in the ``currencyPair`` query parameter.
    start, end : int or float
        Bounds of the requested range, placed verbatim in the ``start`` and
        ``end`` query parameters (the defaults suggest UNIX timestamps in
        seconds).

    Returns
    -------
    pandas.DataFrame
        The decoded JSON payload converted to a frame.

    Raises
    ------
    ValueError
        If the server answers with an HTTP error status, with a body that is
        not valid JSON, or with a JSON object carrying an ``error`` field.
    """
    url = 'https://poloniex.com/public?command=returnTradeHistory&currencyPair={}&start={}&end={}'.format(currency_pair, start, end)
    http = httplib2.Http()
    response, content = http.request(url, 'GET')
    if response.status >= 400:
        # Fail with the status and a body excerpt instead of letting an HTML
        # error page surface later as an opaque JSON decoding error.
        raise ValueError('returnTradeHistory request failed with HTTP {}: {!r}'.format(
            response.status, content[:200]))
    payload = json.loads(content)
    if isinstance(payload, dict) and 'error' in payload:
        # An error object is a dict of scalars; pandas would reject it with
        # an unrelated "must pass an index" message, so report it directly.
        raise ValueError('returnTradeHistory returned an error: {}'.format(payload['error']))
    return pd.DataFrame(payload)


def get_trade_history(currency_pair='USDT_BTC'):
    """
    Sequentially collect the whole trade history of one market.

    The history is cached in ``poloniex_<currency_pair>.pkl.gz`` (relative
    path). The cache is loaded first if it can be read, then extended
    forward up to the present and backward until trade ID 1 has been seen,
    and finally written back.

    Parameters
    ----------
    currency_pair : str
        Market identifier passed to ``get_trade_history_``.

    Returns
    -------
    pandas.DataFrame
        All collected trades, de-duplicated and sorted by ``tradeID``.
        Empty if nothing was cached and nothing could be fetched.

    TODO. parallel access?
    """
    # Local import: the file only imports ``datetime`` and ``timedelta``.
    from datetime import timezone

    def timestamp(date):
        # Convert a 'YYYY-mm-dd HH:MM:SS' trade date to epoch seconds.
        # The string is interpreted explicitly as UTC (Poloniex is understood
        # to report trade dates in UTC) so the result does not depend on the
        # timezone of the machine running the crawler.
        dt = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
        return dt.replace(tzinfo=timezone.utc).timestamp()

    def nxt_range(data):
        # Decide which (start, end) window to request next;
        # (None, None) means the history is complete.
        interval = timedelta(days=24).total_seconds()
        now = datetime.now().timestamp()
        if data.empty:
            return now - interval, now
        max_t = timestamp(data.date.max())
        min_t = timestamp(data.date.min())
        # Newest known trade is more than an hour old: catch up forward,
        # overlapping the known data by 10 seconds.
        if now - max_t > timedelta(hours=1).total_seconds():
            return max_t - 10, now
        # Trade ID 1 not seen yet: step one window further into the past.
        if 1 not in data.tradeID.values:
            return min_t - interval, min_t + 10
        return None, None

    def crawl(data, start, end):
        # Fetch one window and merge it into the accumulated frame.
        new = get_trade_history_(currency_pair, start, end)
        if new.empty:
            # Nothing returned: keep the frame as is (an empty response has
            # no 'tradeID' column to sort by).
            return data
        data = pd.concat([data, new]).drop_duplicates()
        data = data.sort_values(by='tradeID').reset_index(drop=True)
        return data

    cache = "poloniex_{}.pkl.gz".format(currency_pair)
    try:
        data = pd.read_pickle(cache)
    except Exception:
        # Best effort: a missing or unreadable cache means starting from
        # scratch. KeyboardInterrupt/SystemExit are no longer swallowed.
        data = pd.DataFrame()
    print(data)

    start, end = nxt_range(data)
    while start is not None:
        print('seek', start, end)
        rows_before = len(data)
        data = crawl(data, start, end)
        if len(data) == rows_before:
            # The window added nothing, so nxt_range would return the same
            # window again; stop instead of requesting it forever.
            print('no new trades in range; stopping')
            break
        print('now', data.tradeID.min(), data.tradeID.max())
        start, end = nxt_range(data)
    if not data.empty:
        fill_gaps(data)
    data.to_pickle(cache)
    return data


def fill_gaps(df):
    """
    Report where the trade ID sequence is not continuous.

    Each row's ``tradeID`` is compared with the previous row's. The ``date``
    of every row that does not follow its predecessor by exactly 1 is
    collected, and the resulting list is printed.

    Filling the gaps is not implemented yet: ``df`` is left untouched and
    nothing is returned.
    """
    step = df.tradeID - df.tradeID.shift()
    breaks = df.loc[step != 1]
    # The first row has no predecessor (its step is NaN), so it is always
    # selected; drop it from the report.
    ts = list(breaks.date)[1:]
    print(ts)


if __name__ == "__main__":
    # Script entry point: crawl (or refresh) the USDT_BTC history and show it.
    df = get_trade_history(currency_pair="USDT_BTC")
    print(df)
	import httplib2
	import pandas as pd
	import simplejson as json
	from datetime import datetime
	from datetime import timedelta


	def get_trade_history_(currency_pair='USDT_BTC', start=1494000000, end=1600000000):
	    """
	    Query Poloniex's public ``returnTradeHistory`` endpoint once.

	    Parameters
	    ----------
	    currency_pair : str
	        Market identifier, placed in the ``currencyPair`` query parameter.
	    start, end : int or float
	        Bounds of the requested range, placed verbatim in the ``start`` and
	        ``end`` query parameters (the defaults suggest UNIX timestamps in
	        seconds).

	    Returns
	    -------
	    pandas.DataFrame
	        The decoded JSON payload converted to a frame.

	    Raises
	    ------
	    ValueError
	        If the server answers with an HTTP error status, with a body that
	        is not valid JSON, or with a JSON object carrying an ``error``
	        field.
	    """
	    url = 'https://poloniex.com/public?command=returnTradeHistory&currencyPair={}&start={}&end={}'.format(currency_pair, start, end)
	    http = httplib2.Http()
	    response, content = http.request(url, 'GET')
	    if response.status >= 400:
	        # Fail with the status and a body excerpt instead of letting an
	        # HTML error page surface later as an opaque JSON decoding error.
	        raise ValueError('returnTradeHistory request failed with HTTP {}: {!r}'.format(
	            response.status, content[:200]))
	    payload = json.loads(content)
	    if isinstance(payload, dict) and 'error' in payload:
	        # An error object is a dict of scalars; pandas would reject it with
	        # an unrelated "must pass an index" message, so report it directly.
	        raise ValueError('returnTradeHistory returned an error: {}'.format(payload['error']))
	    return pd.DataFrame(payload)


	def get_trade_history(currency_pair='USDT_BTC'):
	    """
	    Sequentially collect the whole trade history of one market.

	    The history is cached in ``poloniex_<currency_pair>.pkl.gz`` (relative
	    path). The cache is loaded first if it can be read, then extended
	    forward up to the present and backward until trade ID 1 has been seen,
	    and finally written back.

	    Parameters
	    ----------
	    currency_pair : str
	        Market identifier passed to ``get_trade_history_``.

	    Returns
	    -------
	    pandas.DataFrame
	        All collected trades, de-duplicated and sorted by ``tradeID``.
	        Empty if nothing was cached and nothing could be fetched.

	    TODO. parallel access?
	    """
	    # Local import: the file only imports ``datetime`` and ``timedelta``.
	    from datetime import timezone

	    def timestamp(date):
	        # Convert a 'YYYY-mm-dd HH:MM:SS' trade date to epoch seconds.
	        # The string is interpreted explicitly as UTC (Poloniex is
	        # understood to report trade dates in UTC) so the result does not
	        # depend on the timezone of the machine running the crawler.
	        dt = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
	        return dt.replace(tzinfo=timezone.utc).timestamp()

	    def nxt_range(data):
	        # Decide which (start, end) window to request next;
	        # (None, None) means the history is complete.
	        interval = timedelta(days=24).total_seconds()
	        now = datetime.now().timestamp()
	        if data.empty:
	            return now - interval, now
	        max_t = timestamp(data.date.max())
	        min_t = timestamp(data.date.min())
	        # Newest known trade is more than an hour old: catch up forward,
	        # overlapping the known data by 10 seconds.
	        if now - max_t > timedelta(hours=1).total_seconds():
	            return max_t - 10, now
	        # Trade ID 1 not seen yet: step one window further into the past.
	        if 1 not in data.tradeID.values:
	            return min_t - interval, min_t + 10
	        return None, None

	    def crawl(data, start, end):
	        # Fetch one window and merge it into the accumulated frame.
	        new = get_trade_history_(currency_pair, start, end)
	        if new.empty:
	            # Nothing returned: keep the frame as is (an empty response
	            # has no 'tradeID' column to sort by).
	            return data
	        data = pd.concat([data, new]).drop_duplicates()
	        data = data.sort_values(by='tradeID').reset_index(drop=True)
	        return data

	    cache = "poloniex_{}.pkl.gz".format(currency_pair)
	    try:
	        data = pd.read_pickle(cache)
	    except Exception:
	        # Best effort: a missing or unreadable cache means starting from
	        # scratch. KeyboardInterrupt/SystemExit are no longer swallowed.
	        data = pd.DataFrame()
	    print(data)

	    start, end = nxt_range(data)
	    while start is not None:
	        print('seek', start, end)
	        rows_before = len(data)
	        data = crawl(data, start, end)
	        if len(data) == rows_before:
	            # The window added nothing, so nxt_range would return the same
	            # window again; stop instead of requesting it forever.
	            print('no new trades in range; stopping')
	            break
	        print('now', data.tradeID.min(), data.tradeID.max())
	        start, end = nxt_range(data)
	    if not data.empty:
	        fill_gaps(data)
	    data.to_pickle(cache)
	    return data


	def fill_gaps(df):
	    """
	    Report where the trade ID sequence is not continuous.

	    Each row's ``tradeID`` is compared with the previous row's. The
	    ``date`` of every row that does not follow its predecessor by exactly
	    1 is collected, and the resulting list is printed.

	    Filling the gaps is not implemented yet: ``df`` is left untouched and
	    nothing is returned.
	    """
	    step = df.tradeID - df.tradeID.shift()
	    breaks = df.loc[step != 1]
	    # The first row has no predecessor (its step is NaN), so it is always
	    # selected; drop it from the report.
	    ts = list(breaks.date)[1:]
	    print(ts)


	if __name__ == "__main__":
	    # Script entry point: crawl (or refresh) the USDT_BTC history and show it.
	    df = get_trade_history("USDT_BTC")
	    print(df)