Basic script to create a code dataset by downloading repos from GitHub
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Basic script to create a code dataset by downloading code from GitHub.
Takes a GitHub repository search URL (filtered by language), walks through the
result pages, clones each repository, extracts the code files, concatenates
them, and splits the result into train and val sets (a simple head/tail cut,
not randomized).
Has several hardcoded assumptions.
"""
import os
import subprocess
import time

import fire
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://github.com"
def get_page(url):
    """
    Utility function used to get a BeautifulSoup object from a given URL.
    """
    session = requests.Session()
    headers = {
        'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
        'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    }
    try:
        req = session.get(url, headers=headers)
    except requests.exceptions.RequestException:
        return None
    bs = BeautifulSoup(req.text, 'html.parser')
    return bs
def scrape_repo_list(first_url, pages=1):
    """Collect repository links from the first `pages` pages of search results."""
    links = []
    for i in range(1, pages + 1):
        url = first_url + f'&p={i}'
        bs = get_page(url)
        if bs is None:
            # Request failed; skip this results page.
            continue
        links += scrape_github(bs)
    return links
def scrape_github(bs):
    """Extract repository links from a GitHub search results page."""
    links = []
    for item in bs.find_all(class_='repo-list-item'):
        link = item.find('a').get('href')
        if link:
            links.append(link)
    return links
def download_repos(links, repos_dir="out/"):
    """Shallow-clone each repository link into repos_dir."""
    if not os.path.isdir(repos_dir):
        os.mkdir(repos_dir)
    result = None
    for link in links:
        git_link = BASE_URL + link + ".git"
        print(f"Downloading repository from {git_link}")
        result = subprocess.run(
            ["git", "clone", git_link, "--single-branch", "--depth", "1"],
            cwd=repos_dir
        )
        # Be polite to GitHub between clones.
        time.sleep(1)
    return result
def extract_code_files(directory, out_dir="data/"):
    """Concatenate all .py files under directory and write train/val splits."""
    full_text = ""
    for root, dirs, files in os.walk(directory):
        for fi in files:
            # TODO remove hardcoded extension
            if fi.endswith(".py"):
                filepath = os.path.join(root, fi)
                try:
                    with open(filepath) as f:
                        full_text += f.read()
                        full_text += "\n\n<END_FILE>\n\n"
                except Exception as detail:
                    print(detail)
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)
    # First 20% of the concatenated text goes to val, the rest to train
    # (a simple head/tail cut, not a randomized split).
    with open(os.path.join(out_dir, "val.txt"), "wb") as f:
        f.write(full_text[:len(full_text) // 5].encode("utf-8").strip())
    with open(os.path.join(out_dir, "train.txt"), "wb") as f:
        f.write(full_text[len(full_text) // 5:].encode("utf-8").strip())
def main(
    repos_dir: str = 'repos/',
    skip_download: bool = False,
    pages: int = 5,
    text_dir: str = 'data',
    url: str = 'https://github.com/search?l=&q=stars%3A%3E5+extension%3Apy+language%3APython&type=Repositories'
):
    print(f"Skip download: {skip_download}")
    if not skip_download:
        links = scrape_repo_list(url, pages=pages)
        print(f"Downloading: {links}")
        download_repos(links, repos_dir=repos_dir)
    extract_code_files(repos_dir, out_dir=text_dir)


if __name__ == "__main__":
    fire.Fire(main)
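A minimal usage sketch, assuming the gist is saved locally as scrape_code_dataset.py (the filename is arbitrary, not part of the gist): because the entry point is fire.Fire(main), each keyword argument of main is exposed as a command-line flag, and main can also be called directly from Python.

# Clone two pages of search results into repos/ and write data/train.txt
# and data/val.txt:
#   python scrape_code_dataset.py --pages=2 --repos_dir=repos/ --text_dir=data
#
# Re-extract from already-cloned repos without downloading again:
#   python scrape_code_dataset.py --skip_download=True
#
# Equivalent direct call from Python (same defaults as the CLI):
main(pages=2, repos_dir="repos/", text_dir="data", skip_download=False)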