Skip to content

Instantly share code, notes, and snippets.

@danvk
Created February 9, 2020 17:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danvk/6d059b0b7e50280789dbc9c41e5dd317 to your computer and use it in GitHub Desktop.
Save danvk/6d059b0b7e50280789dbc9c41e5dd317 to your computer and use it in GitHub Desktop.
Collect all the repos with >=15 stars using a recursive partitioning scheme
def split_interval(a, b):
    """Split the inclusive integer interval [a, b] into two adjacent halves.

    Returns a two-element list ``[(a, mid), (mid + 1, b)]`` with
    ``mid = a + (b - a) // 2``. The first half receives the extra element
    when the interval holds an odd number of integers.

    Callers are expected to pass ``b > a``. For ``a == b`` the second pair
    is ``(a + 1, a)``, i.e. its start exceeds its end.
    """
    # Floor division keeps the arithmetic in exact integers; the previous
    # ``int((b - a) / 2)`` round-tripped through a float, which cannot
    # represent every large integer and could yield a wrong midpoint.
    half = (b - a) // 2
    return [(a, a + half), (a + half + 1, b)]
def split_by_days(stars, day_start, day_end):
    """Scrape one star bucket over a date window, bisecting the window if needed.

    Builds a query for ``stars`` restricted to ``day_start``..``day_end``
    (both formatted as ``YYYY-MM-DD``) and asks ``get_count`` how many
    results it has. With at most 1000 results the query is handed to
    ``scrape`` along with an output file name derived from the bucket and
    the dates. Otherwise the window is cut in two with ``split_interval``
    and each half is processed recursively.

    Args:
        stars: Star qualifier text, interpolated into the query and the
            output file name.
        day_start: First day of the window (datetime-like).
        day_end: Last day of the window (datetime-like).

    Raises:
        ValueError: If the count exceeds 1000 but the window spans zero
            days, so it cannot be divided further.
    """
    first_day = day_start.strftime('%Y-%m-%d')
    last_day = day_end.strftime('%Y-%m-%d')
    query = query_for_star_dates(stars, first_day, last_day)
    total = get_count(query)

    # NOTE(review): 1000 is presumably the search backend's cap on
    # retrievable results per query -- confirm against get_count/scrape.
    if total <= 1000:
        destination = f'repos.star={stars}.{first_day}-{last_day}.json'
        print(f'query: {query}')
        scrape(query, destination)
        return

    span = (day_end - day_start).days
    if span == 0:
        raise ValueError(f"Can't split any more: {stars} / {day_start} .. {day_end}")

    # Offsets are measured in whole days from day_start.
    for lo, hi in split_interval(0, span):
        sub_start = day_start + timedelta(days=lo)
        sub_end = day_start + timedelta(days=hi)
        split_by_days(stars, sub_start, sub_end)
def scrape_range_days():
    """Run ``split_by_days`` for every star-count bucket from 15 stars upward.

    Each ``(low, high)`` bucket is rendered as ``'low..high'`` and processed
    over the fixed window 2007-01-01 .. 2020-02-02. The buckets are disjoint
    and contiguous, so (assuming the ``low..high`` qualifier is inclusive at
    both ends) every star count from 15 to 1,000,000 falls in exactly one
    bucket.
    """
    # Scrape from a low star range up, splitting by creation date (which never changes).
    #
    # Two fixes relative to the earlier list:
    #   * the (1001, 1500), (1501, 5000), (5001, 1_000_000) buckets were
    #     listed twice, so those ranges were processed twice;
    #   * (91, 100) and (100, 119) both contained 100; the latter now
    #     starts at 101.
    ranges = [
        (15, 20), (21, 25), (26, 30), (31, 35), (36, 40), (41, 45), (46, 50),
        (51, 60), (61, 70), (71, 80), (81, 90), (91, 100),
        (101, 119), (120, 139), (140, 159), (160, 179), (180, 200),
        (201, 225), (226, 250), (251, 300), (301, 400), (401, 500),
        (501, 700), (701, 1000), (1001, 1500), (1501, 5000), (5001, 1_000_000),
    ]
    window_start = datetime(2007, 1, 1)
    window_end = datetime(2020, 2, 2)
    for low, high in ranges:
        split_by_days(f'{low}..{high}', window_start, window_end)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment