@PeskyPotato
Last active October 13, 2020 07:57
Save all of a GitHub user's starred repositories to a JSON file by scraping the Stars tab.
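The script walks the user's public Stars tab page by page and collects one record per starred repository. Assuming GitHub's markup still matches the selectors used below, each entry ends up as a dict roughly like this (the values here are illustrative, not real output):

star_d = {
    "name": "owner / repo",
    "repo_url": "https://github.com/owner/repo",
    "description": "Short project description",
    "stars": "1,234",
    "forks": "56",
    "updated": "Oct 1, 2020",
}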
import json
import re
from time import sleep
from urllib.request import urlopen, Request

from bs4 import BeautifulSoup as soup

# One dict per starred repository, accumulated across all pages.
stars_list = []
count = 0


def scrape(user, after=""):
    """Scrape one page of `user`'s starred repositories, then follow the next page."""
    global count
    root = "https://github.com"
    if after == "":
        url = "https://github.com/{}?tab=stars".format(user)
    else:
        url = "https://github.com/{}?after={}&tab=stars".format(user, after)

    # Request the page with a browser-style User-Agent header.
    req = Request(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}
    )
    page_soup = soup(urlopen(req).read(), "lxml")

    # Each starred repository on the page sits in one of these containers.
    for star in page_soup.find_all("div", {"class": "col-12 d-block width-full py-4 border-bottom"}):
        star_d = {}
        star_d["name"] = star.find("h3").a.text.strip()
        star_d["repo_url"] = root + star.find("h3").a["href"]
        star_d["description"] = ""
        if star.find("p", {"itemprop": "description"}):
            star_d["description"] = star.find("p", {"itemprop": "description"}).text.strip()
        # The first muted link is the star count; the fork count is missing for repos with no forks.
        info = star.find_all("a", {"class": "muted-link mr-3"})
        star_d["stars"] = info[0].text.strip()
        star_d["forks"] = ""
        if len(info) > 1:
            star_d["forks"] = info[1].text.strip()
        star_d["updated"] = star.find("relative-time").text.strip()
        stars_list.append(star_d)
        count += 1
        print(count, star_d["repo_url"])

    # Save everything collected so far; the file is rewritten after every page,
    # so the final page is not lost when the crawl stops.
    with open("{}-stars.json".format(user), "w+") as f:
        json.dump(stars_list, f)

    # Pagination: the "Next" link carries an ?after=<cursor> parameter. On the
    # first page it is the only pagination link, on later pages it follows the
    # "Previous" link, and on the last page it is absent.
    btn = page_soup.find_all("a", {"class": "btn btn-outline BtnGroup-item"})
    try:
        after = re.search("(?<=after=)(.)*(?=&)", btn[0]["href"])
        if after:
            after = after.group()
        else:
            after = re.search("(?<=after=)(.)*(?=&)", btn[1]["href"]).group()
    except IndexError:
        print("complete")
        return

    sleep(20)  # pause between pages to keep the request rate low
    scrape(user, after)


scrape("LameLemon")