Download All Attachments from Trac

This is a simple script for downloading all attachments from all tickets in a Trac instance. I looked around and found nothing to fit my needs, so here is a script I whipped up. It uses BeautifulSoup4 to crawl the pages and requests to fetch the content.
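The core crawl pattern is only a couple of calls; here is a minimal sketch (the URL is a placeholder, and the selector is the one the full script below uses against the ticket table on the wiki front page):

import requests
from bs4 import BeautifulSoup

# Placeholder URL; point this at your own Trac instance.
resp = requests.get('https://trac.example.org/myproject/wiki')
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'html.parser')
# Each row of the query-results table is one ticket.
for row in soup.select('tbody.trac-query-results > tr'):
    print(row.select('td.id > a')[0]['href'])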

Installation

$ virtualenv venv
$ . venv/bin/activate
$ pip install -r requirements.txt

Usage

$ python extract.py <link to trac site> -exec

The -exec flag is required to actually create directories and files under the current directory; without it the script does a dry run and only shows what would be done.
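For example, against a hypothetical Trac instance (dry run first, then the real extraction):

$ python extract.py https://trac.example.org/myproject
$ python extract.py https://trac.example.org/myproject -exec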

That's all there is to it.

extract.py:

#!/usr/bin/python3
import sys, argparse, requests, os, datetime
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from tqdm import tqdm

parser = argparse.ArgumentParser(description="Download attachments from Trac")
parser.add_argument('-savedir', help='Save into this directory')
parser.add_argument('-exec', action='store_true',
                    help='Perform the extraction, default is to just show what will be done.')
parser.add_argument('baseurl', help='Use this base URL')
args = parser.parse_args()
print(args)

baseurl = args.baseurl
pr = urlparse(baseurl)
scheme = pr.scheme
netloc = pr.netloc

# The save directory defaults to the last component of the base URL's path.
if args.savedir:
    basedir = args.savedir
else:
    basedir = '_'.join(os.path.split(urlparse(baseurl).path)[1:])
if os.path.exists(basedir):
    print(f'save directory "{basedir}" exists ... move it away')
    sys.exit(0)
if args.exec:
    os.makedirs(basedir)

# The wiki front page is expected to list every ticket in a
# TicketQuery-style results table.
resp = requests.get(f'{baseurl}/wiki')
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'html.parser')
tickets = soup.select('tbody.trac-query-results > tr')
for ticket in tqdm(tickets):
    rel_url = ticket.select('td.id > a')[0]['href']
    title = ticket.select('td.summary > a')[0].string
    tid = os.path.split(rel_url)[-1]
    dirname = os.path.join(basedir, f'{tid}_{title}')
    if args.exec:
        os.makedirs(dirname)
    # Fetch the ticket page and recover its date from the
    # "See timeline at ..." tooltip on the timeline link.
    resp = requests.get(f'{scheme}://{netloc}{rel_url}')
    resp.raise_for_status()
    tsoup = BeautifulSoup(resp.text, 'html.parser')
    tline = tsoup.select('div#ticketbox a.timeline')[0]['title']
    ticket_timestr = tline.replace('See timeline at ', '')
    tdate = datetime.datetime.strptime(ticket_timestr, '%m/%d/%Y %I:%M:%S %p')
    if args.exec:
        os.utime(dirname, times=(tdate.timestamp(), tdate.timestamp()))
    # Each attachment <dt> holds two links: the attachment page and
    # the download URL.
    attachments = tsoup.select('dl.attachments > dt')
    for attachment in attachments:
        att_title = attachment.select('a')[0].string
        att_url = attachment.select('a')[1]['href']
        filename = os.path.join(dirname, att_title)
        tline = attachment.select('a.timeline')[0]['title']
        att_timestr = tline.replace('See timeline at ', '')
        att_date = datetime.datetime.strptime(att_timestr, '%m/%d/%Y %I:%M:%S %p')
        resp = requests.get(f'{scheme}://{netloc}{att_url}', stream=True)
        resp.raise_for_status()
        if args.exec:
            # Stream the attachment to disk, then stamp the file with
            # the attachment's timeline date.
            with open(filename, 'wb') as fd:
                for chunk in resp.iter_content(chunk_size=4096):
                    fd.write(chunk)
            os.utime(filename, times=(att_date.timestamp(), att_date.timestamp()))
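For reference, a run with -exec produces a layout like this (the ticket ids and filenames are made up for illustration); each ticket directory and attachment file has its mtime set from the dates parsed off the Trac timeline links:

myproject/
    101_Fix login crash/
        backtrace.txt
        screenshot.png
    102_Update docs/
        patch.diff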
requirements.txt:

beautifulsoup4
requests
tqdm