Skip to content

Instantly share code, notes, and snippets.

@DomWilliams0
Created July 15, 2014 13:56
Show Gist options
  • Save DomWilliams0/250bee4042f1d1256f5a to your computer and use it in GitHub Desktop.
Save DomWilliams0/250bee4042f1d1256f5a to your computer and use it in GitHub Desktop.
Extracts all individual words from the tenjava 2014 repositories to a single file
import urllib.request
import re
import os
import glob
def fetch_repos(outputpath):
    """Scrape the tenjava repository listing and save the repo names.

    Requests pages 1-9 of ``https://github.com/tenjava/repositories`` and
    collects every repository name ending in ``-t1``, ``-t2`` or ``-t3``
    that appears in a ``/tenjava/<name>"`` link. The de-duplicated names
    are written to ``outputpath`` in sorted order, one per line.

    Args:
        outputpath: Path of the text file to (over)write with the names.
    """
    print("fetching repo names from github")
    last_page = 9
    # Raw string avoids the invalid "\-" escape. The name is restricted to
    # characters that cannot run past the closing quote, so two links on
    # the same HTML line are never merged into one bogus match.
    pat = re.compile(r'/tenjava/([\w.\-]+-t[1-3])"')
    repos = set()
    for page in range(1, last_page + 1):
        url = "https://github.com/tenjava/repositories?page=%d" % page
        # A stray undecodable byte should not abort the whole scrape.
        html = read_page(url).decode("utf-8", errors="replace")
        repos.update(pat.findall(html))
        print("scraped repo page %d/%d" % (page, last_page))
    with open(outputpath, "w") as out:
        for repo in sorted(repos):
            print(repo, file=out)
def load_names(repofilepath):
    """Read repository names from a text file, one name per line.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped, so an empty name never reaches the download step.

    Args:
        repofilepath: Path of the file to read.

    Returns:
        A list of the non-empty names in file order.
    """
    with open(repofilepath) as repofile:
        stripped = (line.strip() for line in repofile)
        return [name for name in stripped if name]
def download_repos(names, zipdir):
    """Download the master-branch zip archive of each named tenjava repo.

    Each archive is fetched from
    ``https://github.com/tenjava/<name>/archive/master.zip`` and stored as
    ``<zipdir>/<name>.zip``. A repository that cannot be fetched is
    reported and skipped; the remaining ones are still downloaded.

    Args:
        names: Iterable of repository names.
        zipdir: Directory to store the archives in; created if missing.
    """
    print("downloading all repos - this may take a while")
    os.makedirs(zipdir, exist_ok=True)
    for n in names:
        try:
            data = read_page("https://github.com/tenjava/%s/archive/master.zip" % n)
        except Exception as exc:
            # Best-effort: one failing repo must not stop the batch, but
            # Ctrl-C / SystemExit are no longer swallowed as with a bare except.
            print("could not download", n, "-", exc)
            continue
        # os.path.join works whether or not zipdir ends with a separator.
        download(data, os.path.join(zipdir, n + ".zip"))
def download(data, path):
    """Write ``data`` (bytes) to ``path`` and report the saved path.

    Args:
        data: Bytes to store.
        path: Destination file path; an existing file is overwritten.
    """
    # The context manager guarantees the file is flushed and closed even
    # if the write fails part-way.
    with open(path, "wb") as f:
        f.write(data)
    print("downloaded", path)
def read_page(url):
    """Fetch ``url`` and return the full response body as bytes.

    Args:
        url: URL passed to ``urllib.request.urlopen``.

    Returns:
        The response body as ``bytes``.
    """
    # Close the response (and its socket) as soon as the body is read.
    with urllib.request.urlopen(url) as response:
        return response.read()
def gather_files(dumpdir, zipdir):
    """Extract every downloaded archive and collect its .java files.

    ``dumpdir`` is wiped and recreated. Each ``*.zip`` in ``zipdir`` is
    extracted to ``<zipdir>/<archive name without .zip>``, and every file
    ending in ``.java`` found below that directory is copied into
    ``dumpdir`` as ``<archive name minus its last 3 characters>-<file name>``
    (for ``user-t1.zip`` this gives ``user-<file name>``). Files with the
    same name inside one archive overwrite each other in the dump.

    Args:
        dumpdir: Directory that receives the flat collection of .java files.
        zipdir: Directory containing the downloaded ``*.zip`` archives.
    """
    import shutil
    import zipfile

    print("extracting all repos to the dump")

    # Delete the previous dump without going through a shell.
    if os.path.isdir(dumpdir):
        shutil.rmtree(dumpdir)
    elif os.path.exists(dumpdir):
        os.remove(dumpdir)
    os.mkdir(dumpdir)

    for archive in glob.glob(os.path.join(zipdir, "*.zip")):
        dirname = os.path.basename(archive)[:-4]  # drop ".zip"
        extracted = os.path.join(zipdir, dirname)
        try:
            with zipfile.ZipFile(archive) as z:
                z.extractall(extracted)
        except zipfile.BadZipFile as exc:
            # One corrupt download should not abort the whole run.
            print("could not extract", archive, "-", exc)
            continue

        prefix = dirname[:-3]  # drop the "-tN" suffix
        for root, _dirs, filenames in os.walk(extracted):
            for name in filenames:
                if not name.endswith(".java"):
                    continue
                # shutil.copy handles spaces and shell metacharacters in
                # paths, which the old "cp %s %s" shell string did not.
                shutil.copy(
                    os.path.join(root, name),
                    os.path.join(dumpdir, prefix + "-" + name),
                )
def process_files(outputpath, dumpdir, sort):
    """Write every word found in the dumped .java files to ``outputpath``.

    A word is a run of word characters and apostrophes; apostrophes are
    then removed and results of length 1 or less are discarded. Words are
    written one per line, file by file (files visited in sorted path order).

    Args:
        outputpath: Text file to (over)write with the extracted words.
        dumpdir: Directory containing the ``*.java`` files, with or without
            a trailing path separator.
        sort: When true, additionally write the same lines, sorted by code
            point, to ``outputpath + ".sorted"``.
    """
    print("extracting all words")
    token_pat = re.compile(r"[\w']+")
    # os.path.join makes the pattern correct with or without a trailing slash.
    java_files = sorted(glob.glob(os.path.join(dumpdir, "*.java")))
    with open(outputpath, "w") as outfile:
        for path in java_files:
            # Undecodable bytes are replaced rather than aborting the run.
            with open(path, errors="replace") as f:
                text = f.read()
            for token in token_pat.findall(text):
                # Tokens contain only word characters and apostrophes, so
                # dropping the apostrophes leaves a purely \w word.
                processed = token.replace("'", "")
                if len(processed) > 1:
                    print(processed, file=outfile)
    if sort:
        # Sort in-process instead of piping through a "cat | sort" shell string.
        with open(outputpath) as unsorted:
            lines = sorted(unsorted)
        with open(outputpath + ".sorted", "w") as sorted_file:
            sorted_file.writelines(lines)
def filter_words(wordpath, outputpath, amount):
    """Keep only the words that occur at least ``amount`` times.

    Reads one word per line from ``wordpath`` and writes every word whose
    total count is at least ``amount`` to ``outputpath``, repeated once per
    occurrence, grouped by word in order of first appearance. Prints how
    many distinct words were kept.

    Args:
        wordpath: Input file with one word per line.
        outputpath: Output file to (over)write.
        amount: Minimum number of occurrences for a word to be kept.
    """
    from collections import Counter

    print("filtering out rare words")
    # Strip before counting so that a final line without a trailing
    # newline is counted together with the other occurrences of that word.
    with open(wordpath) as infile:
        stripped = (line.strip() for line in infile)
        count = Counter(word for word in stripped if word)

    savecount = 0
    with open(outputpath, "w") as outfile:
        for word, occurrences in count.items():
            if occurrences >= amount:
                savecount += 1
                outfile.write((word + "\n") * occurrences)
    print("filtered", savecount, "words")
if __name__ == "__main__":
    # All artefacts are produced inside a dedicated working directory.
    maindir = "tenjava"
    if not os.path.isdir(maindir):
        os.mkdir(maindir)
    os.chdir(maindir)

    # Locations of intermediate and final outputs, relative to maindir.
    repofile, outputfile, filteredwordsfile = (
        "repos.txt",
        "words.txt",
        "filteredwords.txt",
    )
    dumpdir, zipdir = "dump/", "zips/"

    # Pipeline: scrape names -> download zips -> collect .java files ->
    # extract words -> keep only words seen at least 200 times.
    fetch_repos(repofile)
    download_repos(load_names(repofile), zipdir)
    gather_files(dumpdir, zipdir)
    process_files(outputfile, dumpdir, sort=True)
    filter_words(outputfile, filteredwordsfile, amount=200)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment