quick and dirty OpenGrok project crawl+download
# my quick-and-dirty solution until this feature is implemented: https://github.com/oracle/opengrok/issues/2000
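# Example invocation (the script name, host, project path, and cookie value
# below are placeholders; this gist targets Python 2 with the old
# BeautifulSoup 3 package):
#   python2 opengrok_crawl.py \
#       -u "https://opengrok.xyz.com/source/xref/project/" \
#       -s "JSESSIONID=AAAA..."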
from BeautifulSoup import BeautifulSoup
import urllib2
import re
import os
import argparse
parser = argparse.ArgumentParser()
requiredNamed = parser.add_argument_group("required named arguments")
requiredNamed.add_argument("-u","--url", type=str, help="The base URL of the project to crawl (https://opengrok.xyz.com/source/xref/project/(...))", required=True)
requiredNamed.add_argument("-s","--session", type=str, help="Session Cookie (JSESSIONID=AAAA...)", required=True)
args = parser.parse_args()
SESSION = args.session
BASEURL = args.url
BASEDIR = "./"
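# NOTE: the bootstrap at the bottom of the script derives the project
# directory name via split('/')[-2], so BASEURL appears to assume a trailing
# slash, as in the --url help text above.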
def fetchURL(url, session):
    # fetch a listing page with the session cookie attached and parse it
    opener = urllib2.build_opener()
    opener.addheaders.append(('Cookie', session))
    html_page = opener.open(url)
    soup = BeautifulSoup(html_page)
    return soup
def fetchRAW(url, session):
    # OpenGrok serves the raw file contents when "xref" is swapped for "raw" in the path
    url = url.replace("xref", "raw")
    opener = urllib2.build_opener()
    opener.addheaders.append(('Cookie', session))
    html_page = opener.open(url)
    return html_page.read()
def createDir(dir):
    # only announce and create the directory if it doesn't already exist
    if not os.path.exists(dir):
        print "Creating directory: " + dir
        os.makedirs(dir)
def downloadFile(url, file):
    print "Downloading file: " + file
    res = fetchRAW(url, SESSION)
    f = open(file, "wb")  # binary mode so raw bytes are written untouched
    f.write(res)
    f.close()
def crawl(subdir):
    url = BASEURL + subdir
    print "Crawling: \n" + url
    soup = fetchURL(url, SESSION)
    directories = []
    files = []
    if subdir:
        createDir(BASEDIR + subdir)
    # get all directories
    print "Directories:"
    for link in soup.findAll('a', attrs={'class': re.compile("r")}):
        print link.get('href')
        directories.append(link.get('href'))
    # get all files
    print "\nFiles:"
    for link in soup.findAll('a', attrs={'class': re.compile("p")}):
        print link.get('href')
        files.append(link.get('href'))
    for i in files:
        downloadFile(url + i, BASEDIR + subdir + i)
    for i in directories:
        crawl(subdir + i)
# bootstrap: create a directory named after the project (the path component
# before BASEURL's trailing slash), cd into it, and crawl from the root
createDir(BASEURL.split('/')[-2:][0])
os.chdir(BASEURL.split('/')[-2:][0])
crawl("")