Created
September 19, 2012 23:00
-
-
Save jjjake/3752902 to your computer and use it in GitHub Desktop.
openreadingroom.com scripts; it gets pretty ugly!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
#import lxml.html, lxml.etree | |
import lxml.etree | |
import subprocess | |
import json | |
import urllib | |
# Directory the script was launched from; upload() resolves up.py against it.
ROOT_DIR = os.getcwd()
# Shared parser: pages are fetched by lxml and decoded as UTF-8.
utf8_parser = lxml.etree.HTMLParser(encoding='utf-8')
#______________________________________________________________________________
def get_articles(url):
    """Fetch *url* and return every <article> element found on the page."""
    tree = lxml.etree.parse(url.encode('utf-8'), parser=utf8_parser)
    articles = tree.xpath('//article')
    return articles
#______________________________________________________________________________ | |
def mk_meta_dict(article):
    """Extract post metadata (post_id, title, date, subjects) from an <article> node."""
    post_id = article.get('id').split('-')[-1]
    title = article.xpath('header/div/h2/a[@class="entry-title"]/text()')[0]
    # The <span class="updated"> title attribute is an ISO timestamp; keep the date part.
    date = article.xpath('span[@class="updated"]')[0].get('title').split('T')[0]
    categories = article.xpath('div/div[@class="meta-pullout meta-left-pullout"]/ul/li/span[@class="category"]/a[@rel="category tag"]/text()')
    tags = article.xpath('div/div[@class="meta-pullout meta-left-pullout"]/ul/li/span[@class="tags tax"]/a[@rel="tag"]/text()')
    return {'post_id': post_id,
            'title': title,
            'date': date,
            'subject': categories + tags}
#______________________________________________________________________________ | |
def get_pdf(article, post_id):
    """Download the PDF linked from *article* into upload/.

    Returns (filename-without-dir, article_url) on success, or None when the
    article has no thumbnail link to follow.
    """
    article_element = article.xpath('div/div[@class="entry entry-content fix"]/a[@class="suf-thumbnail-anchor-left"]')
    if not article_element:
        return None
    article_url = article_element[0].get('href')
    # Scrape the linked page for its .pdf URL.
    # NOTE(review): the URL is interpolated into a shell string -- fine for
    # this one-off scrape, but shell-injectable on untrusted input.
    cmd = 'lwp-request -o links "%s" | grep pdf | cut -f2' % article_url
    r = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    url = r.stdout.read().strip()
    fname = 'upload/orr-%s_%s.pdf' % (post_id, url.split('/')[-1].split('.')[0])
    fname = urllib.unquote(fname).decode('utf-8')
    # Bug fix: the output filename was unquoted, so names containing spaces
    # or shell metacharacters broke the wget command line.
    wget = 'wget -q -nc -O "%s" "%s"' % (fname, url)
    subprocess.call(wget, shell=True)
    return fname.replace('upload/', ''), article_url
#______________________________________________________________________________ | |
def write_meta_dict(meta_dict):
    """Serialize *meta_dict* as JSON to upload/<identifier>.json.

    Best-effort: returns None (writing nothing) when the identifier key is
    missing or the file cannot be written.
    """
    try:
        with open('upload/%s.json' % meta_dict['identifier'], 'wb') as f:
            f.write(json.dumps(meta_dict))
    except (KeyError, IOError, OSError):
        # Narrowed from a bare `except:` that silently swallowed every error,
        # including programming mistakes like NameError.
        return None
#______________________________________________________________________________ | |
def upload(identifier):
    """Launch up.py for *identifier* in the background (fire-and-forget)."""
    script_path = os.path.join(ROOT_DIR, 'up.py')
    cmd = 'nohup %s %s &' % (script_path, identifier)
    # Bug fix: `call` was referenced bare, but only `import subprocess` is in
    # scope at the top of the file -- any invocation raised NameError.
    subprocess.call(cmd, shell=True)
#______________________________________________________________________________ | |
# Main driver: walk listing pages 13-45 of openreadingroom.com, scrape each
# article's metadata and PDF, and write a per-item JSON metadata file next to
# the downloaded PDF in upload/.
for i in range(13,46):
    print i
    url = 'http://www.openreadingroom.com/page/%s' % i
    for article in get_articles(url):
        meta_dict = mk_meta_dict(article)
        get_pdf_response = get_pdf(article, meta_dict['post_id'])
        if not get_pdf_response:
            # Article had no PDF link -- nothing to archive.
            continue
        pdf, article_url = get_pdf_response
        # Archive.org identifiers must be ASCII: drop non-ASCII characters,
        # then strip any leading/trailing '_' or '-' left behind.
        meta_dict['identifier'] = (u'%s' % pdf.split('.')[0]).encode('ascii', 'ignore').strip('_-')
        print "CREATING::\t" + meta_dict['identifier']
        # Re-fetch the full article page to pull a description paragraph.
        parsed_article = lxml.etree.parse(article_url.encode('utf-8'), parser=utf8_parser)
        description = parsed_article.xpath('body//div[@class="entry fix"]/p/text()')
        if description:
            meta_dict['description'] = description[0]
        else:
            meta_dict['description'] = None
        write_meta_dict(meta_dict)
        #upload(identifier)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Read identifiers from the file "up" (one per line); for each item that does
# not yet exist on archive.org, launch up.py in the background, throttling the
# number of concurrent python uploader processes.
while read item
do
    # An empty metadata response ("{}") means the item does not exist yet;
    # anything else means it is already archived, so skip it.
    if [[ `curl -s "http://archive.org/metadata/$item"` != "{}" ]]
    then
        continue
    fi
    echo "Uploading: $item"
    # Bug fix: this was an `if` that slept 10s once and then launched the
    # upload regardless of load; loop until the process count actually drops.
    while [ `ps -U jake u | grep python | wc -l` -gt 6 ]
    do
        echo "There are 5 threads uploading... let's back off for a second!"
        sleep 10
    done
    # Quote $item so identifiers with unusual characters survive word splitting.
    nohup /1/incoming/tmp/openreadingroom/up.py "$item" &
done < up
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import os | |
import glob | |
import time | |
import json | |
import codecs | |
import urllib | |
import requests | |
from urllib import urlencode | |
import boto | |
from boto.s3.key import Key | |
from boto.s3.connection import OrdinaryCallingFormat | |
# Hard-coded deploy paths on the ingest box.
ROOT_DIR = '/1/incoming/tmp/openreadingroom'
UPLOAD_DIR = '/1/incoming/tmp/openreadingroom/upload'
# Module-level connection to archive.org's S3-compatible API; importing this
# module therefore requires IA_S3_KEY and IA_S3_SECRET in the environment.
CON = boto.connect_s3(os.environ['IA_S3_KEY'], os.environ['IA_S3_SECRET'],
                      host='s3.us.archive.org', is_secure=False,
                      calling_format=OrdinaryCallingFormat())
# S3 Uploader >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> | |
#______________________________________________________________________________ | |
def compile_meta_dict():
    """Build the x-archive-* S3 header dict for the item named in sys.argv[1].

    Reads upload/<identifier>.json (written by the scraper) and maps its
    fields onto archive.org S3 metadata headers.  Headers with empty/None
    values are dropped and all values are UTF-8 encoded for the wire.
    """
    meta_dict = json.load(codecs.open('upload/%s.json' % sys.argv[1], encoding='utf-8'))
    # Removed a dead `filename = glob.glob(...)` local the original had
    # flagged with "#####" -- its value was never used.
    header_dict = {'x-archive-queue-derive': '0',
                   'x-archive-meta-noindex': 'true',
                   'x-amz-auto-make-bucket': '1',
                   'x-archive-meta-mediatype': 'texts',
                   'x-archive-meta-language': 'tam',
                   'x-archive-meta-date': str(meta_dict['date']),
                   'x-archive-meta-description': meta_dict['description'],
                   'x-archive-meta-collection': 'openreadingroom',
                   'x-archive-meta-title': meta_dict['title']}
    # (A second conditional assignment of the description was redundant: the
    # final comprehension already filters out falsy values.)
    # Subjects become x-archive-meta01-subject, x-archive-meta02-subject, ...
    # -- the first header is special-cased, then the counter continues from 2.
    i = 0
    for s in meta_dict['subject']:
        if i == 0:
            header_dict['x-archive-meta01-subject'] = s
            i += 2
        else:
            header_dict['x-archive-meta0%s-subject' % i] = s
            i += 1
    # Drop empty headers; encode everything to UTF-8 bytes.
    return {k: v.encode('utf-8') for k, v in header_dict.iteritems() if v}
#______________________________________________________________________________ | |
def get_bucket(bucket_name): | |
print '\t\tGetting Bucket...' | |
bucket = CON.lookup(bucket_name) | |
#bucket = CON.get_bucket(bucket_name) | |
if bucket is not None: | |
print '\t\tFound existing bucket %s' % bucket_name | |
return bucket | |
print '\t\tCreating new bucket %s' % bucket_name | |
headers = compile_meta_dict() | |
bucket = CON.create_bucket(bucket_name, headers=headers) | |
i=0 | |
while i<60: | |
b = CON.lookup(bucket_name) | |
if b is not None: | |
return bucket | |
print '\t\tWaiting for bucket creation...' | |
time.sleep(10) | |
i+=1 | |
raise NameError("Could not create or lookup " + bucket_name) | |
#______________________________________________________________________________ | |
def create_item(identifier): | |
bucket = get_bucket(identifier) | |
try: | |
if len(identifier) <= 10: | |
filename = glob.glob('upload/%s_*.pdf' % sys.argv[1])[0].replace('upload/', '') | |
else: | |
filename = glob.glob('upload/%s*.pdf' % sys.argv[1])[0].replace('upload/', '') | |
except IndexError: | |
return | |
if bucket.get_key(filename) is not None: | |
print '\t\tWARNING::\tFile already exists, not deleting from server!' | |
return None | |
filepath = os.path.join(UPLOAD_DIR, filename) | |
k = Key(bucket) | |
up_headers = {'x-archive-queue-derive': '0'} | |
# Contrib submit pdfs to preserve filenames with funky characters. | |
url = "http://www.us.archive.org/contrib_submit.php?" | |
params = dict(identifier=identifier, | |
submitter="jake@archive.org", | |
update_mode=1, | |
no_derive=1) | |
#for pdf in pdfs: | |
#item = item.split('/')[-1] | |
if bucket.get_key(filename): | |
return None | |
filename = filename.decode('utf-8').encode('utf-8') | |
params['from_url'] = "rsync://collections.us.archive.org/incoming_1/tmp/openreadingroom/upload/%s" % filename | |
req = requests.get(url, params=urlencode(params)) | |
print "CREATED::\thttp://archive.org/details/%s" % identifier | |
#______________________________________________________________________________ | |
# Entry point: usage is `up.py <identifier>`.
create_item(sys.argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This was a big hack, you don't want it. But, it worked! +1,500 Indian texts archived:
http://archive.org/details/openreadingroom
Move along ;-).