Skip to content

Instantly share code, notes, and snippets.

@Gelob
Last active December 14, 2015 02:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Gelob/16aacab95d2d59887d86 to your computer and use it in GitHub Desktop.
Save Gelob/16aacab95d2d59887d86 to your computer and use it in GitHub Desktop.
pipeline.py with --bind-address for standalone users. You will need to place an IP address after --bind-address= on line 175. Example: "--bind-address=192.168.1.1",
# This file defines a seesaw pipeline for the ArchiveTeam Warrior.
# It can also be run standalone:
#
# pip install seesaw
# run-pipeline pipeline.py YOURNICKNAME
#
# (or run run-pipeline --help for more details)
#
import functools
import os
import os.path
import shutil
import time
from distutils.version import StrictVersion
from tornado.httpclient import AsyncHTTPClient, HTTPRequest
# check the seesaw version before importing any other components
import seesaw
if StrictVersion("0.0.12") > StrictVersion(seesaw.__version__):
    # The installed seesaw is older than the minimum this pipeline supports.
    raise Exception("This pipeline needs seesaw version 0.0.12 or higher.")
from seesaw.project import *
from seesaw.config import *
from seesaw.item import *
from seesaw.task import *
from seesaw.pipeline import *
from seesaw.externalprocess import *
from seesaw.tracker import *
from seesaw.util import find_executable
###########################################################################
# Find a useful Wget+Lua executable.
#
# WGET_LUA will be set to the first path that
# 1. does not crash with --version, and
# 2. prints the required version string
WGET_LUA = find_executable(
    "Wget+Lua",
    "GNU Wget 1.14.lua.20130120-8476",
    [
        "./wget-lua",
        "./wget-lua-warrior",
        "./wget-lua-local",
        "../wget-lua",
        "../../wget-lua",
        "/home/warrior/wget-lua",
        "/usr/bin/wget-lua",
    ]
)

# None of the candidate paths yielded a usable binary: stop right away.
if not WGET_LUA:
    raise Exception("No usable Wget+Lua found.")
###########################################################################
# User-agent string sent with external requests.
#
# Pass this constant to Wget on the command line (see the "-U" option in
# the pipeline definition).
USER_AGENT = (
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) "
    "AppleWebKit/533.20.25 (KHTML, like Gecko) "
    "Version/5.0.4 Safari/533.20.27"
)
###########################################################################
# The version number of this pipeline definition.
#
# Update this each time you make a non-cosmetic change.
# It will be added to the WARC files and reported to the tracker.
#
# In this file it is passed to GetItemFromTracker and UploadWithTracker,
# written into the "posterous-dld-script-version" WARC header, and included
# in the defaults given to PrepareStatsForTracker.
VERSION = "20130222.01"
###########################################################################
# This section defines project-specific tasks.
#
# Simple tasks (tasks that do not need any concurrency) are based on the
# SimpleTask class and have a process(item) method that is called for
# each item.
class PrepareDirectories(SimpleTask):
    """
    Create a fresh working directory for an item and pick its WARC filename.

    Reads item["item_name"] and item["data_dir"] (set before this task runs;
    data_dir points to a working directory reserved for this item), and sets:

      item["item_dir"]       = "<data_dir>/<item_name>"
      item["warc_file_base"] = "<warc_prefix>-<item_name>-<timestamp>"

    where <timestamp> is the local time formatted as YYYYMMDD-HHMMSS.
    Later tasks (e.g. the Wget call) use these values; item["item_dir"] is
    the place for temporary files.  An empty "<warc_file_base>.warc.gz" file
    is created inside item["item_dir"].

    warc_prefix should be set to the project name.
    """

    def __init__(self, warc_prefix):
        SimpleTask.__init__(self, "PrepareDirectories")
        self.warc_prefix = warc_prefix

    def process(self, item):
        name = item["item_name"]
        work_dir = "/".join((item["data_dir"], name))

        # Always start from an empty directory: remove any existing one first.
        if os.path.isdir(work_dir):
            shutil.rmtree(work_dir)
        os.makedirs(work_dir)

        item["item_dir"] = work_dir
        stamp = time.strftime("%Y%m%d-%H%M%S")
        item["warc_file_base"] = "%s-%s-%s" % (self.warc_prefix, name, stamp)

        # Create (or truncate) the WARC file so that it exists but is empty.
        warc_path = "%(item_dir)s/%(warc_file_base)s.warc.gz" % item
        with open(warc_path, "w"):
            pass
class MoveFiles(SimpleTask):
    """
    Move the finished WARC out of the item's temporary directory.

    The "<warc_file_base>.warc.gz" file is renamed from item["item_dir"]
    into item["data_dir"]; afterwards item["item_dir"] and everything still
    in it is deleted.
    """

    def __init__(self):
        SimpleTask.__init__(self, "MoveFiles")

    def process(self, item):
        downloaded = "%(item_dir)s/%(warc_file_base)s.warc.gz" % item
        final = "%(data_dir)s/%(warc_file_base)s.warc.gz" % item
        os.rename(downloaded, final)

        # The WARC is out of the way; drop the remaining temporary files.
        leftovers = "%(item_dir)s" % item
        shutil.rmtree(leftovers)
###########################################################################
# Initialize the project.
#
# This will be shown in the warrior management panel. The logo should not
# be too big. The deadline is optional.
# The deadline below needs the datetime module.  Import it explicitly rather
# than relying on one of the `from seesaw.<module> import *` lines to happen
# to provide the name.
import datetime

project = Project(
    title="Posterous",
    project_html="""
<img class="project-logo" alt="Posterous Logo" src="http://archiveteam.org/images/6/6c/Posterous_logo.png" />
<h2>Posterous.com <span class="links"><a href="http://www.posterous.com/">Website</a> &middot; <a
href="http://tracker.archiveteam.org/posterous/">Leaderboard</a></span></h2>
<p><i>Posterous</i> is closing April, 30th, 2013</p>
""",
    # The month is written as 4, not 04: a leading zero makes an octal
    # literal in Python 2 and is a syntax error in Python 3.
    utc_deadline=datetime.datetime(2013, 4, 30, 23, 59, 0),
)
###########################################################################
# The ID of the tracker for this warrior (used in URLs below).
# It is substituted into "http://tracker.archiveteam.org/%s" for the
# GetItemFromTracker, UploadWithTracker and SendDoneToTracker tasks.
TRACKER_ID = "posterous"
###########################################################################
# The pipeline.
#
# Items move through each task on the pipeline.
# Items are dicts, so tasks can set properties and can use properties set
# by earlier tasks (such as the item["item_name"] property).
#
pipeline = Pipeline(
    # 1. Request an item from the tracker (universal-tracker protocol).
    #    `downloader` is provided by the warrior environment.  The task
    #    waits until an item is available and sets item["item_name"] before
    #    it finishes.
    GetItemFromTracker(
        "http://tracker.archiveteam.org/%s" % TRACKER_ID,
        downloader,
        VERSION
    ),

    # 2. Create the working directory and choose the filenames.
    #    warc_prefix is the first part of the WARC filename.  Sets
    #    item["item_dir"] and item["warc_file_base"].
    PrepareDirectories(warc_prefix="posterous.com"),

    # 3. Run Wget+Lua.  ItemInterpolation() objects are resolved at runtime,
    #    once there is an item whose values can be filled into the strings.
    WgetDownload(
        [
            WGET_LUA,
            "-U", USER_AGENT,
            "-nv",
            "-o", ItemInterpolation("%(item_dir)s/wget.log"),
            "--no-check-certificate",
            # Standalone users: put the local IP address to bind to after
            # the "=", for example "--bind-address=192.168.1.1".
            "--bind-address=",
            "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
            "--truncate-output",
            "-e", "robots=off",
            "--rotate-dns",
            "--recursive", "--level=inf",
            "--page-requisites",
            "--span-hosts",
            # The item's own host plus the hosts that serve its files.
            "--domains", ItemInterpolation(
                "%(item_name)s,s3.amazonaws.com,files.posterous.com,"
                "getfile.posterous.com,getfile0.posterous.com,"
                "getfile1.posterous.com,getfile2.posterous.com,"
                "getfile3.posterous.com,getfile4.posterous.com,"
                "getfile5.posterous.com,getfile6.posterous.com,"
                "getfile7.posterous.com,getfile8.posterous.com,"
                "getfile9.posterous.com,getfile10.posterous.com"
            ),
            "--timeout", "60",
            "--tries", "20",
            "--waitretry", "5",
            "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
            "--warc-header", "operator: Archive Team",
            "--warc-header", "posterous-dld-script-version: " + VERSION,
            "--warc-header", ItemInterpolation("posterous-user: %(item_name)s"),
            ItemInterpolation("https://%(item_name)s/"),
        ],
        max_tries=2,
        # Open question from the original author: which Wget exit codes
        # should count as a success?
        accept_on_exit_code=[0, 4, 6, 8],
    ),

    # 4. Build the item["stats"] value that is later sent to the tracker.
    PrepareStatsForTracker(
        # Standard values that are always reported.
        defaults={"downloader": downloader, "version": VERSION},
        # Feeds the size counter on the tracker.  The group names should
        # match the groups configured on the tracker; a group may list
        # several files and sizes are measured per group.
        file_groups={
            "data": [
                ItemInterpolation("%(item_dir)s/%(warc_file_base)s.warc.gz"),
            ],
        }
    ),

    # 5. Move the WARC from item["item_dir"] to item["data_dir"] and remove
    #    the temporary files.
    MoveFiles(),

    # 6. Upload.  Several items can be in the pipeline at once; this wrapper
    #    caps how many of them upload concurrently (1 by default, 1 to 4 via
    #    the NumberConfigValue, which can be changed in the configuration
    #    panel).
    LimitConcurrent(
        NumberConfigValue(
            min=1,
            max=4,
            default="1",
            name="shared:rsync_threads",
            title="Rsync threads",
            description="The maximum number of concurrent uploads."
        ),
        # Asks the tracker for an upload target, which can be HTTP or rsync
        # and is configured in the tracker admin panel.
        UploadWithTracker(
            "http://tracker.archiveteam.org/%s" % TRACKER_ID,
            downloader=downloader,
            version=VERSION,
            # Files to upload; directory names are allowed.
            # Note: HTTP uploads only send the first file on this list.
            files=[
                ItemInterpolation("%(data_dir)s/%(warc_file_base)s.warc.gz"),
            ],
            # Base path for the rsync command: decides whether files end up
            # in a subdirectory on the server.
            rsync_target_source_path=ItemInterpolation("%(data_dir)s/"),
            # Additional rsync parameters (probably standard).
            rsync_extra_args=[
                "--recursive",
                "--partial",
                "--partial-dir", ".rsync-tmp",
            ]
        ),
    ),

    # 7. The item passed every task: notify the tracker and report the stats.
    SendDoneToTracker(
        tracker_url="http://tracker.archiveteam.org/%s" % TRACKER_ID,
        stats=ItemValue("stats")
    )
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment