DomWilliams0/tenjava.py

## tenjava.py
import urllib.request
import re
import os
import glob


def fetch_repos(outputpath):
    """Scrape the tenjava organisation's repository names into a text file.

    Fetches pages 1-9 of the organisation's repository listing on GitHub,
    extracts every repository name ending in ``-t1``, ``-t2`` or ``-t3``
    and writes the sorted, de-duplicated names to ``outputpath``, one per
    line.

    Args:
        outputpath: Path of the file to create or overwrite.
    """
    print("fetching repo names from github")
    last_page = 9
    # Raw string: "\-" in a normal string literal is an invalid escape.
    # The name part excludes quotes, slashes and whitespace so one match
    # can never stretch across several links that share a line.
    pat = re.compile(r'/tenjava/([^"/\s]+-t[1-3])"')

    repos = set()
    for i in range(1, last_page + 1):
        url = "https://github.com/tenjava/repositories?page=%d" % i
        repos.update(pat.findall(read_page(url).decode()))
        print("scraped repo page %d/%d" % (i, last_page))

    # The context manager closes the file even if a write fails.
    with open(outputpath, "w") as out:
        for repo in sorted(repos):
            print(repo, file=out)


def load_names(repofilepath):
    """Read repository names from a text file, one name per line.

    Args:
        repofilepath: Path to the file to read.

    Returns:
        list[str]: The names in file order with surrounding whitespace
        removed. Blank lines are skipped.
    """
    # The context manager guarantees the handle is closed.
    with open(repofilepath) as repofile:
        stripped = (line.strip() for line in repofile)
        # A blank line must not turn into an empty repository name.
        return [name for name in stripped if name]


def download_repos(names, zipdir):
    """Download the master-branch zip archive of each named repository.

    Each archive is saved as ``<zipdir>/<name>.zip``. A repository that
    cannot be fetched is reported on stdout and skipped.

    Args:
        names: Iterable of repository names under the tenjava organisation.
        zipdir: Directory to store the archives in; created if missing.
    """
    print("downloading all repos - this may take a while")
    os.makedirs(zipdir, exist_ok=True)

    for n in names:
        url = "https://github.com/tenjava/%s/archive/master.zip" % n
        try:
            archive = read_page(url)
        except Exception as exc:
            # Best effort: one failing repo must not abort the run, but
            # Ctrl-C (KeyboardInterrupt) is deliberately not caught here.
            print("could not download", n, "-", exc)
            continue
        # os.path.join works whether or not zipdir ends with a separator.
        download(archive, os.path.join(zipdir, "%s.zip" % n))


def download(data, path):
    """Write ``data`` to ``path`` in binary mode and report it on stdout.

    Args:
        data: Bytes to store.
        path: Destination file path; overwritten if it exists.
    """
    # The context manager flushes and closes the file deterministically.
    with open(path, "wb") as f:
        f.write(data)
    print("downloaded", path)


def read_page(url):
    """Fetch ``url`` and return the whole response body.

    Args:
        url: URL to open with ``urllib.request.urlopen``.

    Returns:
        bytes: The undecoded response body.
    """
    # Close the response (and its connection) as soon as the body is read.
    with urllib.request.urlopen(url) as response:
        return response.read()


def gather_files(dumpdir, zipdir):
    """Extract every archive in ``zipdir`` and collect its Java sources.

    Any existing ``dumpdir`` is deleted and recreated. Each ``*.zip`` in
    ``zipdir`` is extracted into ``<zipdir>/<archive name>`` and every
    ``.java`` file found beneath it is copied into ``dumpdir`` as
    ``<archive name minus its last three characters>-<file name>``.

    Args:
        dumpdir: Directory that receives the flattened ``.java`` files.
        zipdir: Directory containing the downloaded ``.zip`` archives.
    """
    print("extracting all repos to the dump")
    import shutil
    import zipfile

    # Delete the previous dump. Done in-process rather than through a
    # shell so unusual path characters cannot change what gets removed.
    if os.path.isdir(dumpdir):
        shutil.rmtree(dumpdir)
    elif os.path.exists(dumpdir):
        os.remove(dumpdir)
    os.mkdir(dumpdir)

    for archive in glob.glob(os.path.join(zipdir, "*.zip")):
        dirname = os.path.splitext(os.path.basename(archive))[0]
        extracted = os.path.join(os.getcwd(), zipdir, dirname)
        with zipfile.ZipFile(archive) as z:
            z.extractall(extracted)

        # Strips the trailing three characters (the "-tN" suffix of the
        # repository names this script scrapes).
        prefix = dirname[:-3]
        for root, _dirs, filenames in os.walk(extracted):
            for filename in filenames:
                if not filename.endswith(".java"):
                    continue
                source = os.path.join(root, filename)
                target = os.path.join(os.getcwd(), dumpdir, prefix + "-" + filename)
                # File names come from downloaded archives; copying without
                # a shell keeps spaces and metacharacters harmless.
                shutil.copy(source, target)


def process_files(outputpath, dumpdir, sort):
    """Write every word found in the dumped Java files to ``outputpath``.

    Each ``*.java`` file directly inside ``dumpdir`` is split into runs of
    word characters and apostrophes. Apostrophes are then removed and every
    result longer than one character is written on its own line.

    Args:
        outputpath: File to create or overwrite with one word per line.
        dumpdir: Directory holding the ``.java`` files.
        sort: If true, also write a sorted copy to ``outputpath + ".sorted"``
            using the system ``sort`` command.
    """
    print("extracting all words")
    token_pat = re.compile(r"[\w']+")
    non_word_pat = re.compile(r"\W+")
    # os.path.join works with or without a trailing separator on dumpdir;
    # sorting makes the output order reproducible.
    java_files = sorted(glob.glob(os.path.join(dumpdir, "*.java")))

    with open(outputpath, "w") as outfile:
        for java_path in java_files:
            # Sources come from arbitrary authors; a file in an unexpected
            # encoding must not abort the whole run.
            with open(java_path, errors="replace") as f:
                text = f.read()
            for token in token_pat.findall(text):
                processed = non_word_pat.sub("", token)
                if len(processed) > 1:
                    print(processed, file=outfile)

    if sort:
        import subprocess

        # Run sort directly (no shell) so the path is never interpreted.
        with open(outputpath + ".sorted", "w") as sorted_out:
            try:
                subprocess.run(["sort", outputpath], stdout=sorted_out)
            except OSError as exc:
                # Sorting is optional; keep going if sort is unavailable.
                print("could not sort", outputpath, "-", exc)


def filter_words(wordpath, outputpath, amount):
    """Keep only the words that occur at least ``amount`` times.

    Reads one word per line from ``wordpath``. Each word whose count is at
    least ``amount`` is written to ``outputpath`` once per occurrence, in
    order of first appearance. The number of distinct words kept is
    printed.

    Args:
        wordpath: Input file with one word per line.
        outputpath: File to create or overwrite with the kept words.
        amount: Minimum number of occurrences a word needs to be kept.
    """
    print("filtering out rare words")
    from collections import Counter

    with open(wordpath) as wordfile:
        # Count stripped words so a final line without a trailing newline
        # is not tallied separately; blank lines are not words.
        stripped = (line.strip() for line in wordfile)
        count = Counter(word for word in stripped if word)

    savecount = 0
    with open(outputpath, "w") as outfile:
        for word, occurrences in count.items():
            if occurrences >= amount:
                savecount += 1
                outfile.write((word + "\n") * occurrences)
    print("filtered", savecount, "words")


if __name__ == '__main__':
    # Run the whole pipeline from inside a dedicated working directory so
    # every relative path below resolves within it.
    maindir = "tenjava"
    if not os.path.isdir(maindir):
        os.mkdir(maindir)
    os.chdir(maindir)

    # Directories (trailing slash included) and files used by the steps.
    zipdir, dumpdir = "zips/", "dump/"
    repofile, outputfile, filteredwordsfile = (
        "repos.txt",
        "words.txt",
        "filteredwords.txt",
    )

    # 1. scrape repo names, 2. download their archives, 3. collect the
    # .java files, 4. extract words, 5. keep words seen at least 200 times.
    fetch_repos(repofile)
    download_repos(load_names(repofile), zipdir)
    gather_files(dumpdir, zipdir)
    process_files(outputfile, dumpdir, True)
    filter_words(outputfile, filteredwordsfile, 200)
	import urllib.request
	import re
	import os
	import glob


	def fetch_repos(outputpath):
		"""Scrape the tenjava organisation's repository names into a text file.

		Fetches pages 1-9 of the organisation's repository listing on GitHub,
		extracts every repository name ending in ``-t1``, ``-t2`` or ``-t3``
		and writes the sorted, de-duplicated names to ``outputpath``, one per
		line.

		Args:
			outputpath: Path of the file to create or overwrite.
		"""
		print("fetching repo names from github")
		last_page = 9
		# Raw string: "\-" in a normal string literal is an invalid escape.
		# The name part excludes quotes, slashes and whitespace so one match
		# can never stretch across several links that share a line.
		pat = re.compile(r'/tenjava/([^"/\s]+-t[1-3])"')

		repos = set()
		for i in range(1, last_page + 1):
			url = "https://github.com/tenjava/repositories?page=%d" % i
			repos.update(pat.findall(read_page(url).decode()))
			print("scraped repo page %d/%d" % (i, last_page))

		# The context manager closes the file even if a write fails.
		with open(outputpath, "w") as out:
			for repo in sorted(repos):
				print(repo, file=out)


	def load_names(repofilepath):
		"""Read repository names from a text file, one name per line.

		Args:
			repofilepath: Path to the file to read.

		Returns:
			list[str]: The names in file order with surrounding whitespace
			removed. Blank lines are skipped.
		"""
		# The context manager guarantees the handle is closed.
		with open(repofilepath) as repofile:
			stripped = (line.strip() for line in repofile)
			# A blank line must not turn into an empty repository name.
			return [name for name in stripped if name]


	def download_repos(names, zipdir):
		"""Download the master-branch zip archive of each named repository.

		Each archive is saved as ``<zipdir>/<name>.zip``. A repository that
		cannot be fetched is reported on stdout and skipped.

		Args:
			names: Iterable of repository names under the tenjava organisation.
			zipdir: Directory to store the archives in; created if missing.
		"""
		print("downloading all repos - this may take a while")
		os.makedirs(zipdir, exist_ok=True)

		for n in names:
			url = "https://github.com/tenjava/%s/archive/master.zip" % n
			try:
				archive = read_page(url)
			except Exception as exc:
				# Best effort: one failing repo must not abort the run, but
				# Ctrl-C (KeyboardInterrupt) is deliberately not caught here.
				print("could not download", n, "-", exc)
				continue
			# os.path.join works whether or not zipdir ends with a separator.
			download(archive, os.path.join(zipdir, "%s.zip" % n))


	def download(data, path):
		"""Write ``data`` to ``path`` in binary mode and report it on stdout.

		Args:
			data: Bytes to store.
			path: Destination file path; overwritten if it exists.
		"""
		# The context manager flushes and closes the file deterministically.
		with open(path, "wb") as f:
			f.write(data)
		print("downloaded", path)


	def read_page(url):
		"""Fetch ``url`` and return the whole response body.

		Args:
			url: URL to open with ``urllib.request.urlopen``.

		Returns:
			bytes: The undecoded response body.
		"""
		# Close the response (and its connection) as soon as the body is read.
		with urllib.request.urlopen(url) as response:
			return response.read()


	def gather_files(dumpdir, zipdir):
		"""Extract every archive in ``zipdir`` and collect its Java sources.

		Any existing ``dumpdir`` is deleted and recreated. Each ``*.zip`` in
		``zipdir`` is extracted into ``<zipdir>/<archive name>`` and every
		``.java`` file found beneath it is copied into ``dumpdir`` as
		``<archive name minus its last three characters>-<file name>``.

		Args:
			dumpdir: Directory that receives the flattened ``.java`` files.
			zipdir: Directory containing the downloaded ``.zip`` archives.
		"""
		print("extracting all repos to the dump")
		import shutil
		import zipfile

		# Delete the previous dump. Done in-process rather than through a
		# shell so unusual path characters cannot change what gets removed.
		if os.path.isdir(dumpdir):
			shutil.rmtree(dumpdir)
		elif os.path.exists(dumpdir):
			os.remove(dumpdir)
		os.mkdir(dumpdir)

		for archive in glob.glob(os.path.join(zipdir, "*.zip")):
			dirname = os.path.splitext(os.path.basename(archive))[0]
			extracted = os.path.join(os.getcwd(), zipdir, dirname)
			with zipfile.ZipFile(archive) as z:
				z.extractall(extracted)

			# Strips the trailing three characters (the "-tN" suffix of the
			# repository names this script scrapes).
			prefix = dirname[:-3]
			for root, _dirs, filenames in os.walk(extracted):
				for filename in filenames:
					if not filename.endswith(".java"):
						continue
					source = os.path.join(root, filename)
					target = os.path.join(os.getcwd(), dumpdir, prefix + "-" + filename)
					# File names come from downloaded archives; copying without
					# a shell keeps spaces and metacharacters harmless.
					shutil.copy(source, target)


	def process_files(outputpath, dumpdir, sort):
		"""Write every word found in the dumped Java files to ``outputpath``.

		Each ``*.java`` file directly inside ``dumpdir`` is split into runs of
		word characters and apostrophes. Apostrophes are then removed and every
		result longer than one character is written on its own line.

		Args:
			outputpath: File to create or overwrite with one word per line.
			dumpdir: Directory holding the ``.java`` files.
			sort: If true, also write a sorted copy to ``outputpath + ".sorted"``
				using the system ``sort`` command.
		"""
		print("extracting all words")
		token_pat = re.compile(r"[\w']+")
		non_word_pat = re.compile(r"\W+")
		# os.path.join works with or without a trailing separator on dumpdir;
		# sorting makes the output order reproducible.
		java_files = sorted(glob.glob(os.path.join(dumpdir, "*.java")))

		with open(outputpath, "w") as outfile:
			for java_path in java_files:
				# Sources come from arbitrary authors; a file in an unexpected
				# encoding must not abort the whole run.
				with open(java_path, errors="replace") as f:
					text = f.read()
				for token in token_pat.findall(text):
					processed = non_word_pat.sub("", token)
					if len(processed) > 1:
						print(processed, file=outfile)

		if sort:
			import subprocess

			# Run sort directly (no shell): nothing in the command line can be
			# misread as a pipe, an escape or extra arguments.
			with open(outputpath + ".sorted", "w") as sorted_out:
				try:
					subprocess.run(["sort", outputpath], stdout=sorted_out)
				except OSError as exc:
					# Sorting is optional; keep going if sort is unavailable.
					print("could not sort", outputpath, "-", exc)


	def filter_words(wordpath, outputpath, amount):
		"""Keep only the words that occur at least ``amount`` times.

		Reads one word per line from ``wordpath``. Each word whose count is at
		least ``amount`` is written to ``outputpath`` once per occurrence, in
		order of first appearance. The number of distinct words kept is
		printed.

		Args:
			wordpath: Input file with one word per line.
			outputpath: File to create or overwrite with the kept words.
			amount: Minimum number of occurrences a word needs to be kept.
		"""
		print("filtering out rare words")
		from collections import Counter

		with open(wordpath) as wordfile:
			# Count stripped words so a final line without a trailing newline
			# is not tallied separately; blank lines are not words.
			stripped = (line.strip() for line in wordfile)
			count = Counter(word for word in stripped if word)

		savecount = 0
		with open(outputpath, "w") as outfile:
			for word, occurrences in count.items():
				if occurrences >= amount:
					savecount += 1
					outfile.write((word + "\n") * occurrences)
		print("filtered", savecount, "words")


	if __name__ == '__main__':
		# Run the whole pipeline from inside a dedicated working directory so
		# every relative path below resolves within it.
		maindir = "tenjava"

		if not os.path.isdir(maindir):
			os.mkdir(maindir)
		os.chdir(maindir)

		# Files and directories (trailing slash included) used by the steps.
		repofile = "repos.txt"
		dumpdir = "dump/"
		zipdir = "zips/"
		outputfile = "words.txt"
		filteredwordsfile = "filteredwords.txt"

		# 1. scrape repo names, 2. download their archives, 3. collect the
		# .java files, 4. extract words, 5. keep words seen at least 200 times.
		fetch_repos(repofile)
		download_repos(load_names(repofile), zipdir)
		gather_files(dumpdir, zipdir)
		process_files(outputfile, dumpdir, True)
		filter_words(outputfile, filteredwordsfile, 200)