Created
February 9, 2020 17:25
-
-
Save danvk/6d059b0b7e50280789dbc9c41e5dd317 to your computer and use it in GitHub Desktop.
Collect all the repos with >=15 stars using a recursive partitioning scheme
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def split_interval(a, b):
    """Split the inclusive integer interval ``[a, b]`` into two halves.

    Args:
        a: Lower bound of the interval (inclusive).
        b: Upper bound of the interval (inclusive); expected to be > ``a``.

    Returns:
        A two-element list ``[(a, mid), (mid + 1, b)]`` where ``mid`` is
        ``a`` plus half the span, rounded down. The two halves do not
        overlap and together cover every integer in ``[a, b]``.
    """
    # Floor division keeps the arithmetic in exact integers; going through
    # a float (``int((b - a) / 2)``) silently loses precision on huge spans.
    half = (b - a) // 2
    mid = a + half
    return [(a, mid), (mid + 1, b)]
def split_by_days(stars, day_start, day_end):
    """Scrape one star range over a date window, bisecting the window if needed.

    The window is formatted as ``YYYY-MM-DD`` strings and turned into a query
    with ``query_for_star_dates``. If ``get_count`` reports at most 1000
    matches, the query is scraped into a JSON file named after the star range
    and the window. Otherwise the window is cut into two halves with
    ``split_interval`` and each half is processed recursively.

    Args:
        stars: Star-range string interpolated into the query and file name.
        day_start: First day of the window (a ``datetime``).
        day_end: Last day of the window (a ``datetime``).

    Raises:
        ValueError: If the window spans zero days and still has more than
            1000 matches, so it cannot be divided further.
    """
    date_format = '%Y-%m-%d'
    first_day = day_start.strftime(date_format)
    last_day = day_end.strftime(date_format)
    query = query_for_star_dates(stars, first_day, last_day)

    # NOTE(review): 1000 is presumably the search API's cap on retrievable
    # results per query -- confirm against the API docs.
    if get_count(query) <= 1000:
        destination = f'repos.star={stars}.{first_day}-{last_day}.json'
        print(f'query: {query}')
        scrape(query, destination)
        return

    span = (day_end - day_start).days
    if span == 0:
        raise ValueError(f"Can't split any more: {stars} / {day_start} .. {day_end}")

    # Offsets are in days relative to day_start.
    for lo, hi in split_interval(0, span):
        half_start = day_start + timedelta(days=lo)
        half_end = day_start + timedelta(days=hi)
        split_by_days(stars, half_start, half_end)
def scrape_range_days():
    """Scrape every star-count bucket from 15 stars upward.

    Each ``(low, high)`` bucket is formatted as the string ``'low..high'`` and
    handed to ``split_by_days`` together with the fixed date window
    2007-01-01 through 2020-02-02.
    """
    # Scrape from a low star range up, splitting by creation date (which never changes).
    #
    # Buckets must be disjoint and listed once: assuming the ``low..high``
    # qualifier is inclusive at both ends (as the adjacent buckets imply), an
    # overlapping or repeated bucket would scrape the same repos again.
    ranges = [
        (15, 20), (21, 25), (26, 30), (31, 35), (36, 40), (41, 45), (46, 50),
        (51, 60), (61, 70), (71, 80), (81, 90), (91, 100),
        (101, 119), (120, 139), (140, 159), (160, 179), (180, 200),
        (201, 225), (226, 250), (251, 300), (301, 400), (401, 500),
        (501, 700), (701, 1000), (1001, 1500), (1501, 5000), (5001, 1_000_000),
    ]
    window_start = datetime(2007, 1, 1)
    window_end = datetime(2020, 2, 2)
    for a, b in ranges:
        stars = f'{a}..{b}'
        split_by_days(stars, window_start, window_end)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment