danvk/collect_repos.py

## collect_repos.py
def split_interval(a, b):
    """Split the inclusive integer interval [a, b] into two adjacent halves.

    Args:
        a: lower bound of the interval.
        b: upper bound of the interval; expected to satisfy ``a <= b``.

    Returns:
        A two-element list ``[(a, mid), (mid + 1, b)]`` where
        ``mid = a + (b - a) // 2``. When ``a == b`` the second tuple is
        ``(a + 1, a)``, i.e. empty/inverted, so callers should not split a
        zero-length interval.
    """
    # Floor division keeps the arithmetic in integers; going through a float
    # (true division + int()) silently loses precision for very large spans.
    d = (b - a) // 2
    return [(a, a + d), (a + d + 1, b)]


def split_by_days(stars, day_start, day_end):
    """Scrape the query for ``stars`` over a date window, bisecting if too big.

    A query is built with ``query_for_star_dates`` from ``stars`` and the two
    dates formatted as ``YYYY-MM-DD``. If ``get_count`` reports at most 1000
    results, the query is scraped into
    ``repos.star=<stars>.<start>-<end>.json``. Otherwise the window is cut in
    two by ``split_interval`` and each half is handled recursively.

    Args:
        stars: star filter forwarded to the query and embedded in the output
            file name (the caller in this file passes strings like '15..20').
        day_start: first day of the window (needs ``strftime`` and date
            arithmetic; the caller in this file passes ``datetime``).
        day_end: last day of the window.

    Raises:
        ValueError: if the query has more than 1000 results but the window
            spans zero days and therefore cannot be split further.
    """
    start_fmt, end_fmt = (d.strftime('%Y-%m-%d') for d in (day_start, day_end))
    q = query_for_star_dates(stars, start_fmt, end_fmt)

    # 1000 is presumably the cap on results retrievable per query -- TODO confirm.
    if get_count(q) <= 1000:
        out_file = f'repos.star={stars}.{start_fmt}-{end_fmt}.json'
        print(f'query: {q}')
        scrape(q, out_file)
        return

    # Too many results for one query: bisect the window by day offsets.
    days = (day_end - day_start).days
    if days == 0:
        raise ValueError(f'Can\'t split any more: {stars} / {day_start} .. {day_end}')

    for a, b in split_interval(0, days):
        half_start = day_start + timedelta(days=a)
        half_end = day_start + timedelta(days=b)
        split_by_days(stars, half_start, half_end)


def scrape_range_days():
    """Scrape every star-count bucket, splitting each one by creation date.

    Walks a fixed list of ``(low, high)`` star buckets from low to high,
    formats each as ``'low..high'`` and passes it to ``split_by_days`` for the
    window 2007-01-01 through 2020-02-02.
    """
    # Scrape from a low star range up, splitting by creation date (which never changes).
    # Each bucket is listed exactly once and buckets do not overlap (bounds are
    # presumably inclusive on both ends -- they abut as N / N+1), so no star
    # count is scraped twice.
    ranges = [
        (15, 20), (21, 25), (26, 30), (31, 35), (36, 40), (41, 45), (46, 50),
        (51, 60), (61, 70), (71, 80), (81, 90), (91, 100),
        (101, 119), (120, 139), (140, 159), (160, 179), (180, 200),
        (201, 225), (226, 250), (251, 300), (301, 400), (401, 500),
        (501, 700), (701, 1000), (1001, 1500), (1501, 5000), (5001, 1_000_000),
    ]
    for a, b in ranges:
        stars = f'{a}..{b}'
        split_by_days(stars, datetime(2007, 1, 1), datetime(2020, 2, 2))
	def split_interval(a, b):
		"""Split the inclusive integer interval [a, b] into two adjacent halves.

		Args:
			a: lower bound of the interval.
			b: upper bound of the interval; expected to satisfy ``a <= b``.

		Returns:
			A two-element list ``[(a, mid), (mid + 1, b)]`` where
			``mid = a + (b - a) // 2``.
		"""
		# Integer floor division avoids the float round-trip, which loses
		# precision for very large spans.
		d = (b - a) // 2
		return [(a, a + d), (a + d + 1, b)]


	def split_by_days(stars, day_start, day_end):
		"""Scrape the query for ``stars`` over a date window, bisecting if too big.

		Builds a query via ``query_for_star_dates`` from ``stars`` and the two
		dates formatted as ``YYYY-MM-DD``. With at most 1000 results (per
		``get_count``) the query is scraped into
		``repos.star=<stars>.<start>-<end>.json``; otherwise the window is
		halved with ``split_interval`` and each half is processed recursively.

		Raises:
			ValueError: if there are more than 1000 results but the window
				spans zero days and cannot be split further.
		"""
		start_fmt = day_start.strftime('%Y-%m-%d')
		end_fmt = day_end.strftime('%Y-%m-%d')
		q = query_for_star_dates(stars, start_fmt, end_fmt)
		c = get_count(q)
		# 1000 is presumably the cap on results retrievable per query -- TODO confirm.
		if c <= 1000:
			out_file = f'repos.star={stars}.{start_fmt}-{end_fmt}.json'
			print(f'query: {q}')
			scrape(q, out_file)
		else:
			days = (day_end - day_start).days
			if days == 0:
				raise ValueError(f'Can\'t split any more: {stars} / {day_start} .. {day_end}')
			# Recurse into both halves, expressed as day offsets from day_start.
			for a, b in split_interval(0, days):
				dt_a = day_start + timedelta(days=a)
				dt_b = day_start + timedelta(days=b)
				split_by_days(stars, dt_a, dt_b)


	def scrape_range_days():
		"""Scrape every star-count bucket, splitting each one by creation date.

		Formats each ``(low, high)`` bucket as ``'low..high'`` and passes it to
		``split_by_days`` for the window 2007-01-01 through 2020-02-02.
		"""
		# Scrape from a low star range up, splitting by creation date (which never changes).
		# Each bucket appears once and buckets do not overlap, so no star count
		# is scraped twice.
		ranges = [
			(15, 20), (21, 25), (26, 30), (31, 35), (36, 40), (41, 45), (46, 50),
			(51, 60), (61, 70), (71, 80), (81, 90), (91, 100),
			(101, 119), (120, 139), (140, 159), (160, 179), (180, 200),
			(201, 225), (226, 250), (251, 300), (301, 400), (401, 500),
			(501, 700), (701, 1000), (1001, 1500), (1501, 5000), (5001, 1_000_000),
		]
		for a, b in ranges:
			stars = f'{a}..{b}'
			split_by_days(stars, datetime(2007, 1, 1), datetime(2020, 2, 2))