Download All Attachments from Trac

This is a simple script for downloading all attachments from all tickets in a Trac instance. I looked around and found nothing to fit my needs, so here is a script I whipped up. It uses BeautifulSoup4 to crawl the pages and requests to fetch the content.
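The core crawl pattern is only a couple of calls; here is a minimal sketch (the URL is a placeholder, and the selector is the one the full script below uses against the ticket table on the wiki front page):

import requests
from bs4 import BeautifulSoup

# Placeholder URL; point this at your own Trac instance.
resp = requests.get('https://trac.example.org/myproject/wiki')
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'html.parser')
# Each row of the query-results table is one ticket.
for row in soup.select('tbody.trac-query-results > tr'):
    print(row.select('td.id > a')[0]['href'])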

Installation

$ virtualenv venv
$ . venv/bin/activate
$ pip install -r requirements.txt

Usage

$ python extract.py <link to trac site> -exec

The -exec flag is required to actually create directories and files under the current directory; without it the script does a dry run and only shows what would be done.
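For example, against a hypothetical Trac instance (dry run first, then the real extraction):

$ python extract.py https://trac.example.org/myproject
$ python extract.py https://trac.example.org/myproject -exec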

That's all there is to it.

extract.py:

#!/usr/bin/python3
import sys, argparse, requests, os, datetime
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from tqdm import tqdm

parser = argparse.ArgumentParser(description="Download attachments from Trac")
parser.add_argument('-savedir', help='Save into this directory')
parser.add_argument('-exec', action='store_true',
                    help='Perform the extraction, default is to just show what will be done.')
parser.add_argument('baseurl', help='Use this base URL')
args = parser.parse_args()
print(args)

baseurl = args.baseurl
pr = urlparse(baseurl)
scheme = pr.scheme
netloc = pr.netloc

# The save directory defaults to the last component of the base URL's path.
if args.savedir:
    basedir = args.savedir
else:
    basedir = '_'.join(os.path.split(urlparse(baseurl).path)[1:])
if os.path.exists(basedir):
    print(f'save directory "{basedir}" exists ... move it away')
    sys.exit(0)
if args.exec:
    os.makedirs(basedir)

# The wiki front page is expected to list every ticket in a
# TicketQuery-style results table.
resp = requests.get(f'{baseurl}/wiki')
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'html.parser')
tickets = soup.select('tbody.trac-query-results > tr')
for ticket in tqdm(tickets):
    rel_url = ticket.select('td.id > a')[0]['href']
    title = ticket.select('td.summary > a')[0].string
    tid = os.path.split(rel_url)[-1]
    dirname = os.path.join(basedir, f'{tid}_{title}')
    if args.exec:
        os.makedirs(dirname)
    # Fetch the ticket page and recover its date from the
    # "See timeline at ..." tooltip on the timeline link.
    resp = requests.get(f'{scheme}://{netloc}{rel_url}')
    resp.raise_for_status()
    tsoup = BeautifulSoup(resp.text, 'html.parser')
    tline = tsoup.select('div#ticketbox a.timeline')[0]['title']
    ticket_timestr = tline.replace('See timeline at ', '')
    tdate = datetime.datetime.strptime(ticket_timestr, '%m/%d/%Y %I:%M:%S %p')
    if args.exec:
        os.utime(dirname, times=(tdate.timestamp(), tdate.timestamp()))
    # Each attachment <dt> holds two links: the attachment page and
    # the download URL.
    attachments = tsoup.select('dl.attachments > dt')
    for attachment in attachments:
        att_title = attachment.select('a')[0].string
        att_url = attachment.select('a')[1]['href']
        filename = os.path.join(dirname, att_title)
        tline = attachment.select('a.timeline')[0]['title']
        att_timestr = tline.replace('See timeline at ', '')
        att_date = datetime.datetime.strptime(att_timestr, '%m/%d/%Y %I:%M:%S %p')
        resp = requests.get(f'{scheme}://{netloc}{att_url}', stream=True)
        resp.raise_for_status()
        if args.exec:
            # Stream the attachment to disk, then stamp the file with
            # the attachment's timeline date.
            with open(filename, 'wb') as fd:
                for chunk in resp.iter_content(chunk_size=4096):
                    fd.write(chunk)
            os.utime(filename, times=(att_date.timestamp(), att_date.timestamp()))
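For reference, a run with -exec produces a layout like this (the ticket ids and filenames are made up for illustration); each ticket directory and attachment file has its mtime set from the dates parsed off the Trac timeline links:

myproject/
    101_Fix login crash/
        backtrace.txt
        screenshot.png
    102_Update docs/
        patch.diff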
requirements.txt:

beautifulsoup4
requests
tqdm