@reidsanders
Last active August 12, 2020 08:27
Basic script to create a code dataset by downloading repos from GitHub
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Basic script to create a code dataset by downloading code from GitHub.

Takes a GitHub repository-search URL (which encodes the language and any other
filters), walks through the result pages, clones each repository, and extracts
the code files, then concatenates them and splits the result into train and
validation sets (not really randomized -- a proper shuffle is awkward with a
single concatenated stream).

Has several hardcoded assumptions.
"""
import requests
from bs4 import BeautifulSoup
import subprocess
import os
import time
import fire

BASE_URL = "https://github.com"


def get_page(url):
    """
    Utility function used to get a BeautifulSoup object from a given URL.
    """
    session = requests.Session()
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    }
    try:
        req = session.get(url, headers=headers)
    except requests.exceptions.RequestException:
        return None
    bs = BeautifulSoup(req.text, 'html.parser')
    return bs


def scrape_repo_list(first_url, pages=1):
    """Collect repository links from the first `pages` result pages of a search URL."""
    links = []
    i = 1
    while True:
        url = first_url + f'&p={i}'
        bs = get_page(url)
        if bs is not None:  # get_page returns None when the request failed
            new_links = scrape_github(bs)
            links += new_links
        i += 1
        if i > pages:
            break
    return links


def scrape_github(bs):
    """Extract repository links from a parsed GitHub search-results page."""
    links = []
    for item in bs.find_all(class_='repo-list-item'):
        link = item.find('a').get('href')
        if link:
            links.append(link)
    return links


def download_repos(links, repos_dir="out/"):
    """Shallow-clone each repository into repos_dir, pausing briefly between clones."""
    if not os.path.isdir(repos_dir):
        os.mkdir(repos_dir)
    results = []
    for link in links:
        git_link = BASE_URL + link + ".git"
        print(f"Downloading repository from {git_link}")
        results.append(subprocess.run(
            ["git", "clone", git_link, "--single-branch", "--depth", "1"],
            cwd=repos_dir
        ))
        time.sleep(1)
    return results


def extract_code_files(directory, out_dir="data/"):
    """Concatenate all code files under directory and write a train/val split."""
    full_text = ""
    for root, dirs, files in os.walk(directory):
        for fi in files:
            # TODO remove hardcoded extension
            if fi.endswith(".py"):
                filepath = os.path.join(root, fi)
                try:
                    with open(filepath) as f:
                        full_text += f.read()
                        full_text += "\n\n<END_FILE>\n\n"
                except Exception as detail:
                    print(detail)
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)
    # First fifth of the concatenated text becomes validation data, the rest training data.
    with open(os.path.join(out_dir, "val.txt"), "wb") as f:
        f.write(full_text[:len(full_text) // 5].encode("utf-8").strip())
    with open(os.path.join(out_dir, "train.txt"), "wb") as f:
        f.write(full_text[len(full_text) // 5:].encode("utf-8").strip())


def main(
    repos_dir: str = 'repos/',
    skip_download: bool = False,
    pages: int = 5,
    text_dir: str = 'data',
    url: str = 'https://github.com/search?l=&q=stars%3A%3E5+extension%3Apy+language%3APython&type=Repositories'
):
    print(f"Skip download: {skip_download}")
    if not skip_download:
        links = scrape_repo_list(url, pages=pages)
        print(f"Downloading: {links}")
        download_repos(links, repos_dir=repos_dir)
    extract_code_files(repos_dir, out_dir=text_dir)


if __name__ == "__main__":
    fire.Fire(main)
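A quick usage note: because fire.Fire(main) exposes the keyword arguments of main as command-line flags, invocations look roughly like the following (the file name download_repos.py and the concrete flag values are assumed here for illustration; the gist does not fix a file name):

    python download_repos.py --pages=2 --repos_dir=repos/ --text_dir=data
    python download_repos.py --skip_download --text_dir=data

The second form skips the scrape-and-clone step and only rebuilds the train/val split from whatever already sits under repos_dir.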