Basic script to create a code dataset by downloading repos from GitHub
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Basic script to create a code dataset by downloading code from GitHub.
Takes a GitHub repository search URL (filtered by language), walks through the
result pages, clones each repository, extracts the code files, concatenates
them, and splits the result into train and val sets (a simple head/tail cut,
not randomized).
Has several hardcoded assumptions.
"""
import os
import subprocess
import time

import fire
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://github.com"
def get_page(url):
    """
    Utility function used to get a BeautifulSoup object from a given URL.
    """
    session = requests.Session()
    headers = {
        'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    }
    try:
        req = session.get(url, headers=headers)
    except requests.exceptions.RequestException:
        return None
    bs = BeautifulSoup(req.text, 'html.parser')
    return bs
def scrape_repo_list(first_url, pages=1):
    """Collect repository links from the first `pages` pages of search results."""
    links = []
    for i in range(1, pages + 1):
        url = first_url + f'&p={i}'
        bs = get_page(url)
        if bs is None:
            # Request failed; skip this results page.
            continue
        links += scrape_github(bs)
    return links
def scrape_github(bs):
    """Extract repository links from a GitHub search results page."""
    links = []
    for item in bs.find_all(class_='repo-list-item'):
        link = item.find('a').get('href')
        if link:
            links.append(link)
    return links
def download_repos(links, repos_dir="out/"):
    """Shallow-clone each repository link into repos_dir."""
    if not os.path.isdir(repos_dir):
        os.mkdir(repos_dir)
    result = None
    for link in links:
        git_link = BASE_URL + link + ".git"
        print(f"Downloading repository from {git_link}")
        result = subprocess.run(
            ["git", "clone", git_link, "--single-branch", "--depth", "1"],
            cwd=repos_dir
        )
        # Be polite to GitHub between clones.
        time.sleep(1)
    return result
def extract_code_files(directory, out_dir="data/"):
    """Concatenate all .py files under directory and write train/val splits."""
    full_text = ""
    for root, dirs, files in os.walk(directory):
        for fi in files:
            # TODO remove hardcoded extension
            if fi.endswith(".py"):
                filepath = os.path.join(root, fi)
                try:
                    with open(filepath) as f:
                        full_text += f.read()
                        full_text += "\n\n<END_FILE>\n\n"
                except Exception as detail:
                    print(detail)
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)
    # First 20% of the concatenated text goes to val, the rest to train
    # (a simple head/tail cut, not a randomized split).
    with open(os.path.join(out_dir, "val.txt"), "wb") as f:
        f.write(full_text[:len(full_text) // 5].encode("utf-8").strip())
    with open(os.path.join(out_dir, "train.txt"), "wb") as f:
        f.write(full_text[len(full_text) // 5:].encode("utf-8").strip())
def main(
    repos_dir: str = 'repos/',
    skip_download: bool = False,
    pages: int = 5,
    text_dir: str = 'data',
    url: str = 'https://github.com/search?l=&q=stars%3A%3E5+extension%3Apy+language%3APython&type=Repositories'
):
    print(f"Skip download: {skip_download}")
    if not skip_download:
        links = scrape_repo_list(url, pages=pages)
        print(f"Downloading: {links}")
        download_repos(links, repos_dir=repos_dir)
    extract_code_files(repos_dir, out_dir=text_dir)


if __name__ == "__main__":
    fire.Fire(main)
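A minimal usage sketch, assuming the gist is saved locally as scrape_code_dataset.py (the filename is arbitrary, not part of the gist): because the entry point is fire.Fire(main), each keyword argument of main is exposed as a command-line flag, and main can also be called directly from Python.

# Clone two pages of search results into repos/ and write data/train.txt
# and data/val.txt:
#   python scrape_code_dataset.py --pages=2 --repos_dir=repos/ --text_dir=data
#
# Re-extract from already-cloned repos without downloading again:
#   python scrape_code_dataset.py --skip_download=True
#
# Equivalent direct call from Python (same defaults as the CLI):
main(pages=2, repos_dir="repos/", text_dir="data", skip_download=False)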