Created
July 15, 2014 13:56
-
-
Save DomWilliams0/250bee4042f1d1256f5a to your computer and use it in GitHub Desktop.
Extracts all individual words from the tenjava 2014 repositories to a single file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections
import glob
import os
import re
import shutil
import urllib.request
import zipfile
def fetch_repos(outputpath):
    """Scrape the tenjava repository listing and save the repo names.

    Pages 1-9 of https://github.com/tenjava/repositories are fetched with
    read_page(); every link of the form ``/tenjava/<name>-t1"`` (also t2/t3)
    contributes ``<name>-tN`` to the result.

    Args:
        outputpath: File to (over)write with the sorted, de-duplicated repo
            names, one per line.
    """
    print("fetching repo names from github")
    last_page = 9
    # Raw string avoids the invalid "\-" escape; the character class (instead
    # of a greedy ".+") keeps one match from spanning several links on a line.
    pat = re.compile(r'/tenjava/([\w.-]+-t[1-3])"')
    repos = set()
    for page in range(1, last_page + 1):
        html = read_page("https://github.com/tenjava/repositories?page=%d" % page).decode()
        repos.update(pat.findall(html))
        print("scraped repo page %d/%d" % (page, last_page))
    # The file is only opened once scraping succeeded, and is always closed.
    with open(outputpath, "w") as outfile:
        for repo in sorted(repos):
            print(repo, file=outfile)
def load_names(repofilepath):
    """Return the lines of *repofilepath* as a list of stripped strings.

    Args:
        repofilepath: Text file holding one repository name per line.

    Returns:
        A list with one entry per line, surrounding whitespace removed.
    """
    # The context manager closes the handle; the original left it open.
    with open(repofilepath) as repofile:
        return [line.strip() for line in repofile]
def download_repos(names, zipdir):
    """Download the master-branch zip archive of every named tenjava repo.

    Each archive is fetched with read_page() and stored via download() as
    ``<zipdir>/<name>.zip``. A repo that cannot be fetched is reported and
    skipped so that one failure does not abort the whole batch.

    Args:
        names: Iterable of repository names under the ``tenjava`` account.
        zipdir: Directory that receives the archives; created if missing.
    """
    print("downloading all repos - this may take a while")
    os.makedirs(zipdir, exist_ok=True)
    for name in names:
        try:
            data = read_page("https://github.com/tenjava/%s/archive/master.zip" % name)
        except Exception as exc:
            # Best effort per repo. Unlike a bare ``except:``, this lets
            # KeyboardInterrupt and SystemExit stop the run.
            print("could not download", name, "-", exc)
            continue
        # os.path.join works whether or not zipdir ends with a separator.
        download(data, os.path.join(zipdir, name + ".zip"))
def download(data, path):
    """Write the bytes *data* to *path* and report the written path.

    Args:
        data: Bytes-like payload to store.
        path: Destination file; overwritten if it already exists.
    """
    # Closing explicitly guarantees the bytes are flushed before we report.
    with open(path, "wb") as f:
        f.write(data)
    print("downloaded", path)
def read_page(url):
    """Fetch *url* and return the complete response body as bytes.

    Args:
        url: Any URL accepted by ``urllib.request.urlopen``.

    Returns:
        The raw, undecoded response body.
    """
    # The context manager closes the response once it has been read.
    with urllib.request.urlopen(url) as response:
        return response.read()
def gather_files(dumpdir, zipdir):
    """Extract every zip in *zipdir* and copy its .java files into *dumpdir*.

    Any existing *dumpdir* is deleted first. Each ``<name>.zip`` is extracted
    to ``<zipdir>/<name>/``; every ``*.java`` file found below that directory
    is copied to ``<dumpdir>/<name minus its last 3 chars>-<filename>``, so
    files that map to the same target name overwrite one another.

    Args:
        dumpdir: Flat output directory for the collected .java files.
        zipdir: Directory containing the downloaded ``*.zip`` archives.
    """
    print("extracting all repos to the dump")
    if os.path.exists(dumpdir):
        shutil.rmtree(dumpdir)  # delete previous dump
    os.mkdir(dumpdir)
    for archive in glob.glob(os.path.join(zipdir, "*.zip")):
        dirname = os.path.basename(archive)[:-4]  # drop ".zip"
        extractdir = os.path.join(os.getcwd(), zipdir, dirname)
        with zipfile.ZipFile(archive) as z:
            z.extractall(extractdir)
        for root, _dirs, filenames in os.walk(extractdir):
            for filename in filenames:
                if not filename.endswith(".java"):
                    continue
                # dirname[:-3] removes the trailing "-tN" style suffix.
                target = os.path.join(os.getcwd(), dumpdir, dirname[:-3] + "-" + filename)
                # shutil.copy instead of a shell "cp": file names come from
                # downloaded archives and may contain spaces or shell
                # metacharacters.
                shutil.copy(os.path.join(root, filename), target)
def process_files(outputpath, dumpdir, sort):
    """Write every word found in the dumped .java files to *outputpath*.

    A word is a run of word characters and apostrophes; apostrophes are then
    removed and results of length 0 or 1 are dropped. Words are written one
    per line, in the order they are encountered.

    Args:
        outputpath: File to (over)write with the extracted words.
        dumpdir: Directory holding the ``*.java`` files (with or without a
            trailing separator).
        sort: When true, also write ``<outputpath>.sorted`` containing the
            same lines sorted by code point (done in Python rather than by
            shelling out to ``sort``, so no external tools are required).
    """
    print("extracting all words")
    token_pat = re.compile(r"[\w']+")
    words = []
    for path in glob.glob(os.path.join(dumpdir, "*.java")):
        # errors="replace": one undecodable byte in a source file must not
        # abort the whole run; the replacement character is not a word char.
        with open(path, errors="replace") as src:
            text = src.read()
        for token in token_pat.findall(text):
            # The apostrophe is the only non-word character a token can hold.
            word = token.replace("'", "")
            if len(word) > 1:
                words.append(word)
    with open(outputpath, "w") as outfile:
        outfile.writelines(word + "\n" for word in words)
    if sort:
        with open(outputpath + ".sorted", "w") as sortedfile:
            sortedfile.writelines(word + "\n" for word in sorted(words))
def filter_words(wordpath, outputpath, amount):
    """Copy to *outputpath* only the words occurring at least *amount* times.

    Every kept word is written once per occurrence (so frequencies are
    preserved), grouped by word in order of first appearance.

    Args:
        wordpath: Input file with one word per line.
        outputpath: File to (over)write with the surviving words.
        amount: Minimum number of occurrences a word needs to be kept.
    """
    print("filtering out rare words")
    with open(wordpath) as infile:
        # Count stripped words: keying on raw lines would treat a final
        # line without a newline as a different word.
        count = collections.Counter(line.strip() for line in infile)
    savecount = 0
    with open(outputpath, "w") as outfile:
        for word, occurrences in count.items():
            if occurrences >= amount:
                savecount += 1
                outfile.write((word + "\n") * occurrences)
    print("filtered", savecount, "words")
if __name__ == '__main__':
    # Everything below works inside ./tenjava, so all paths are relative to it.
    maindir = "tenjava"
    os.makedirs(maindir, exist_ok=True)
    os.chdir(maindir)

    repofile = "repos.txt"
    dumpdir = "dump/"
    zipdir = "zips/"
    outputfile = "words.txt"
    filteredwordsfile = "filteredwords.txt"

    # Pipeline: scrape repo names -> download zips -> collect .java files
    # -> extract words -> keep only words seen at least 200 times.
    fetch_repos(repofile)
    download_repos(load_names(repofile), zipdir)
    gather_files(dumpdir, zipdir)
    process_files(outputfile, dumpdir, True)
    filter_words(outputfile, filteredwordsfile, 200)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment