Skip to content

Instantly share code, notes, and snippets.

@hannahwhy
Last active August 29, 2015 14:02
Show Gist options
  • Save hannahwhy/1bdca9cc4235416a3786 to your computer and use it in GitHub Desktop.
Save hannahwhy/1bdca9cc4235416a3786 to your computer and use it in GitHub Desktop.
diff --git a/doc/terse_options.rst b/doc/terse_options.rst
index 725ae72..b223979 100644
--- a/doc/terse_options.rst
+++ b/doc/terse_options.rst
@@ -35,6 +35,7 @@ Brief Option Overview
[--random-file FILE] [--edg-file FILE]
[--warc-file FILENAME] [--warc-append]
[--warc-header STRING] [--warc-max-size NUMBER]
+ [--move-warc-to DIRECTORY]
[--warc-cdx] [--warc-dedup FILE] [--no-warc-compression]
[--no-warc-digests] [--no-warc-keep-log]
[--warc-tempdir DIRECTORY] [-r] [-l NUMBER]
@@ -196,6 +197,8 @@ Brief Option Overview
--warc-header STRING include STRING in WARC file metadata
--warc-max-size NUMBER
write sequential WARC files sized about NUMBER bytes
+ --move-warc-to DIRECTORY once a sequential WARC file has reached its max size,
+ move it to DIRECTORY
--warc-cdx write CDX file along with the WARC file
--warc-dedup FILE write revisit records using digests in FILE
--no-warc-compression
diff --git a/wpull/options.py b/wpull/options.py
index 57af72b..f75483d 100644
--- a/wpull/options.py
+++ b/wpull/options.py
@@ -891,6 +891,12 @@ class AppArgumentParser(argparse.ArgumentParser):
help=_('write sequential WARC files sized about NUMBER bytes')
)
group.add_argument(
+ '--move-warc-to',
+ metavar='DIRECTORY',
+ default=None,
+ help=_('once a sequential WARC file has reached its max size, move it to DIRECTORY')
+ )
+ group.add_argument(
'--warc-cdx',
action='store_true',
help=_('write CDX file along with the WARC file')
diff --git a/wpull/recorder.py b/wpull/recorder.py
index 88a9991..c8bcdf5 100644
--- a/wpull/recorder.py
+++ b/wpull/recorder.py
@@ -11,6 +11,7 @@ import logging
import os.path
import re
import sys
+import shutil
from tempfile import NamedTemporaryFile
import tempfile
import time
@@ -140,6 +141,7 @@ WARCRecorderParams = namedlist.namedtuple(
('digests', True),
('cdx', None),
('max_size', None),
+ ('move_to', None),
('url_table', None),
('software_string', None)
]
@@ -157,6 +159,8 @@ Args:
cdx (bool): If True, a CDX file will be written.
max_size (int): If provided, output files are named like
``name-00000.ext`` and the log file will be in ``name-meta.ext``.
+ move_to (str): If provided, completed sequential WARCs will be moved
+ to the given directory.
url_table (:class:`.database.URLTable`): If given, then ``revist``
records will be written.
software_string (str): The value for the ``software`` field in the
@@ -287,6 +291,15 @@ class WARCRecorder(BaseRecorder):
_logger.debug('Starting new warc file due to max size.')
self._start_new_warc_file()
+ # NOTE(review): this runs after _start_new_warc_file(); confirm that
+ # self._warc_filename still names the finished file at this point.
+ if self._params.move_to is not None:
+     if os.path.isdir(self._params.move_to):
+         shutil.move(self._warc_filename, self._params.move_to)
+     else:
+         _logger.error('%s is not a directory; not moving %s.' % (
+             self._params.move_to, self._warc_filename))
+
def set_length_and_maybe_checksums(self, record, payload_offset=None):
'''Set the content length and possibly the checksums.'''
if self._params.digests:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment