Skip to content

Instantly share code, notes, and snippets.

@thcipriani
Last active July 20, 2017 20:27
Show Gist options
  • Save thcipriani/bd59e25a5db7c3551bcf5d6041a9a0c3 to your computer and use it in GitHub Desktop.
Save thcipriani/bd59e25a5db7c3551bcf5d6041a9a0c3 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python2
# coding: utf-8
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# This script automates creating a list of direct download URLs for the
# PDF files of a set of archive.org items.
#
# As input, this script takes a search list generated via wget following
# the steps here:
#
# https://blog.archive.org/2012/04/26/downloading-in-bulk-using-wget/
#
# Usage is like:
#
# ./archive-org-pdf.py < search.csv > 'galaxymagazinepdfs.txt'
import os
import sys
import requests
from pyquery import PyQuery as pq
BASE_URL = 'https://archive.org/download'

# Seconds to wait for a response before giving up, so that one stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 60


def item_url(line):
    """Return the download URL for one line of the search list.

    ``line`` is one raw line read from stdin: either an item identifier
    (optionally wrapped in double quotes) or a URL that already starts
    with BASE_URL.

    Returns None for lines that name no item, i.e. blank lines and the
    ``"identifier"`` column header.
    """
    identifier = line.strip().strip('"')
    # Compare after normalizing so stray whitespace around the header (or
    # an empty line) does not turn into a bogus request.
    if not identifier or identifier == 'identifier':
        return None
    if identifier.startswith(BASE_URL):
        return identifier
    # URLs always use '/', so join by hand instead of os.path.join, which
    # uses the platform's path separator.
    return BASE_URL + '/' + identifier


def pdf_urls(url):
    """Return the URLs of the PDF files linked from the page at ``url``.

    Only anchors inside ``<pre>`` elements are considered, and only those
    whose link text ends with ``.pdf``. Each result is ``url`` joined with
    that link text.

    Raises a ``requests`` exception if the page cannot be fetched or the
    server answers with an error status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    found = []
    for anchor in pq(response.text)('pre a').items():
        name = anchor.text()
        if name.endswith('.pdf'):
            found.append(url + '/' + name)
    return found


def main():
    """Read the search list from stdin and write the PDF URLs to stdout.

    The URLs are written newline-separated, in input order, once every
    item has been fetched.
    """
    playlist = []
    for line in sys.stdin.read().splitlines():
        url = item_url(line)
        if url is None:
            continue
        playlist.extend(pdf_urls(url))
    sys.stdout.write('\n'.join(playlist))


if __name__ == '__main__':
    main()
#!/usr/bin/env bash
# Build the list of PDF URLs from the search results in search.csv, then
# download every listed file with wget, running the downloads in parallel.

# Stop at the first failure so that a failed or partial URL list is never
# handed to the downloader.
set -euo pipefail

archive-pdfs.py < search.csv > galaxypdfs.txt

# Run one wget job per available processor; each line of galaxypdfs.txt is
# passed as the argument of one wget invocation.
parallel -j "$(nproc)" -a galaxypdfs.txt wget
https://archive.org/download/Galaxy_v03n03_1951-12/Galaxy_v03n03_1951-12.pdf
https://archive.org/download/Galaxy_v05n03_1952-12/Galaxy_v05n03_1952-12.pdf
https://archive.org/download/Galaxy_v28n01_1969-02/Galaxy_v28n01_1969-02.pdf
https://archive.org/download/Galaxy_v25n04_1967-04/Galaxy_v25n04_1967-04.pdf
https://archive.org/download/Galaxy_v23n05_1965-06/Galaxy_v23n05_1965-06.pdf
https://archive.org/download/Galaxy_v22n02_1963-12/Galaxy_v22n02_1963-12.pdf
https://archive.org/download/Galaxy_v30n01_1970-04/Galaxy_v30n01_1970-04.pdf
https://archive.org/download/Galaxy_v24n06_1966-08/Galaxy_v24n06_1966-08.pdf
https://archive.org/download/Galaxy_v33n08_1973-09/Galaxy_v33n08_1973-09.pdf
https://archive.org/download/Galaxy_v31n01_1970-12/Galaxy_v31n01_1970-12.pdf
https://archive.org/download/Galaxy_v35n05_1974-05/Galaxy_v35n05_1974-05.pdf
https://archive.org/download/Galaxy_v31n03_1971-02/Galaxy_v31n03_1971-02.pdf
https://archive.org/download/Galaxy_v31n06_1971-05-06/Galaxy_v31n06_1971-05-06.pdf
https://archive.org/download/Galaxy_v37n04_1976-05/Galaxy_v37n04_1976-05.pdf
https://archive.org/download/galaxymagazine-1952-06/Galaxy_1952_06.pdf
https://archive.org/download/galaxymagazine-1952-06/Galaxy_1952_06_text.pdf
https://archive.org/download/galaxymagazine-1959-06/Galaxy_1959_06.pdf
https://archive.org/download/galaxymagazine-1959-06/Galaxy_1959_06_text.pdf
https://archive.org/download/galaxymagazine-1954-09/Galaxy_1954_09.pdf
https://archive.org/download/galaxymagazine-1954-09/Galaxy_1954_09_text.pdf
https://archive.org/download/galaxymagazine-1951-09/Galaxy_1951_09.pdf
https://archive.org/download/galaxymagazine-1951-09/Galaxy_1951_09_text.pdf
https://archive.org/download/galaxymagazine-1954-11/Galaxy_1954_11.pdf
https://archive.org/download/galaxymagazine-1954-11/Galaxy_1954_11_text.pdf
https://archive.org/download/Galaxy_v39n10_1979-07/Galaxy_v39n10_1979-07.pdf
https://archive.org/download/Galaxy_v39n06_1978-06/Galaxy_v39n06_1978-06.pdf
https://archive.org/download/Galaxy_2nd_Series_v02n02_1995-03/Galaxy_2nd_Series_v02n02_1995-03.pdf
https://archive.org/download/galaxymagazine-1950-12/Galaxy_1950_12.pdf
https://archive.org/download/galaxymagazine-1950-12/Galaxy_1950_12_text.pdf
https://archive.org/download/Galaxy_v18n02_1959-12/Galaxy_v18n02_1959-12.pdf
https://archive.org/download/Galaxy_v16n06_1958-10/Galaxy_v16n06_1958-10.pdf
https://archive.org/download/Galaxy_v20n01_1961-10/Galaxy_v20n01_1961-10.pdf
https://archive.org/download/Galaxy_v17n06_1959-08/Galaxy_v17n06_1959-08.pdf
https://archive.org/download/Galaxy_v15n05_1958-03/Galaxy_v15n05_1958-03.pdf
https://archive.org/download/Galaxy_v07n02_1953-11/Galaxy_v07n02_1953-11.pdf
https://archive.org/download/Galaxy_v10n04_1955-07/Galaxy_v10n04_1955-07.pdf
https://archive.org/download/Galaxy_v12n02_1956-06/Galaxy_v12n02_1956-06.pdf
https://archive.org/download/Galaxy_v10n06_1955-09/Galaxy_v10n06_1955-09.pdf
https://archive.org/download/Galaxy_v07n03_1953-12/Galaxy_v07n03_1953-12.pdf
https://archive.org/download/Galaxy_v12n03_1956-07/Galaxy_v12n03_1956-07.pdf
https://archive.org/download/Galaxy_v11n02_1955-11/Galaxy_v11n02_1955-11.pdf
https://archive.org/download/Galaxy_v09n04_1955-01/Galaxy_v09n04_1955-01.pdf
https://archive.org/download/Galaxy_v11n05_1956-03/Galaxy_v11n05_1956-03.pdf
https://archive.org/download/Galaxy_v07n06_1954-03/Galaxy_v07n06_1954-03.pdf
https://archive.org/download/galaxymagazine-1958-07/Galaxy_1958_07.pdf
https://archive.org/download/galaxymagazine-1958-07/Galaxy_1958_07_text.pdf
https://archive.org/download/galaxymagazine-1958-08/Galaxy_1958_08.pdf
https://archive.org/download/galaxymagazine-1958-08/Galaxy_1958_08_text.pdf
https://archive.org/download/galaxymagazine-1957-06/Galaxy_1957_06.pdf
https://archive.org/download/galaxymagazine-1957-06/Galaxy_1957_06_text.pdf
https://archive.org/download/galaxymagazine-1957-07_modified/galaxymagazine-1957-07_modified.pdf
https://archive.org/download/galaxymagazine-1952-04/Galaxy_1952_04.pdf
https://archive.org/download/galaxymagazine-1952-04/Galaxy_1952_04_text.pdf
https://archive.org/download/galaxymagazine-1953-08/Galaxy_1953_08.pdf
https://archive.org/download/galaxymagazine-1953-08/Galaxy_1953_08_text.pdf
https://archive.org/download/galaxymagazine-1954-12/Galaxy_1954_12.pdf
https://archive.org/download/galaxymagazine-1954-12/Galaxy_1954_12_text.pdf
https://archive.org/download/Galaxy_v38n02_1977-04/Galaxy_v38n02_1977-04.pdf
https://archive.org/download/Galaxy_v39n04_1978-04/Galaxy_v39n04_1978-04.pdf
https://archive.org/download/Galaxy_v03n04_1952-01/Galaxy_v03n04_1952-01.pdf
https://archive.org/download/Galaxy_v01n06_1951-03/Galaxy_v01n06_1951-03.pdf
We can make this file beautiful and searchable if this error is corrected: No commas found in this CSV file in line 0.
"identifier"
"Galaxy_v03n03_1951-12"
"Galaxy_v05n03_1952-12"
"Galaxy_v28n01_1969-02"
"Galaxy_v25n04_1967-04"
"Galaxy_v23n05_1965-06"
"Galaxy_v22n02_1963-12"
"Galaxy_v30n01_1970-04"
"Galaxy_v24n06_1966-08"
"Galaxy_v33n08_1973-09"
"Galaxy_v31n01_1970-12"
"Galaxy_v35n05_1974-05"
"Galaxy_v31n03_1971-02"
"Galaxy_v31n06_1971-05-06"
"Galaxy_v37n04_1976-05"
"galaxymagazine-1952-06"
"galaxymagazine-1959-06"
"galaxymagazine-1954-09"
"galaxymagazine-1951-09"
"galaxymagazine-1954-11"
"Galaxy_v39n10_1979-07"
"Galaxy_v39n06_1978-06"
"Galaxy_2nd_Series_v02n02_1995-03"
"Galaxy_2nd_Series_v01n06_1994-11"
"galaxymagazine-1950-12"
"Galaxy_v18n02_1959-12"
"Galaxy_v16n06_1958-10"
"Galaxy_v20n01_1961-10"
"Galaxy_v17n06_1959-08"
"Galaxy_v15n05_1958-03"
"Galaxy_v07n02_1953-11"
"Galaxy_v10n04_1955-07"
"Galaxy_v12n02_1956-06"
"Galaxy_v10n06_1955-09"
"Galaxy_v07n03_1953-12"
"Galaxy_v12n03_1956-07"
"Galaxy_v11n02_1955-11"
"Galaxy_v09n04_1955-01"
"Galaxy_v11n05_1956-03"
"Galaxy_v07n06_1954-03"
"galaxymagazine-1958-07"
"galaxymagazine-1958-08"
"galaxymagazine-1957-06"
"galaxymagazine-1957-07_modified"
"galaxymagazine-1952-04"
"galaxymagazine-1953-08"
"galaxymagazine-1954-12"
"Galaxy_v38n02_1977-04"
"Galaxy_v39n04_1978-04"
"Galaxy_v03n04_1952-01"
"Galaxy_v01n06_1951-03"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment