@jjjake
Created September 19, 2012 23:00
openreadingroom.com scripts; it gets pretty ugly!
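Three pieces, in order: a scraper that pulls each post's PDF into upload/ and writes a JSON metadata sidecar next to it, a bash driver that feeds identifiers to up.py while skipping items already on archive.org, and up.py itself, which creates the archive.org item and queues the PDF via contrib_submit.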
#!/usr/bin/env python
# Scraper: walk openreadingroom.com's paginated index, pull each post's
# PDF into upload/, and write a JSON metadata sidecar per item.
import os
import json
import subprocess
import urllib

import lxml.etree

ROOT_DIR = os.getcwd()
utf8_parser = lxml.etree.HTMLParser(encoding='utf-8')

#______________________________________________________________________________
def get_articles(url):
    phtml = lxml.etree.parse(url.encode('utf-8'), parser=utf8_parser)
    return phtml.xpath('//article')

#______________________________________________________________________________
def mk_meta_dict(article):
    categories = article.xpath('div/div[@class="meta-pullout meta-left-pullout"]/ul/li/span[@class="category"]/a[@rel="category tag"]/text()')
    tags = article.xpath('div/div[@class="meta-pullout meta-left-pullout"]/ul/li/span[@class="tags tax"]/a[@rel="tag"]/text()')
    return {'post_id': article.get('id').split('-')[-1],
            'title': article.xpath('header/div/h2/a[@class="entry-title"]/text()')[0],
            'date': article.xpath('span[@class="updated"]')[0].get('title').split('T')[0],
            'subject': categories + tags}

#______________________________________________________________________________
def get_pdf(article, post_id):
    article_element = article.xpath('div/div[@class="entry entry-content fix"]/a[@class="suf-thumbnail-anchor-left"]')
    if not article_element:
        return None
    article_url = article_element[0].get('href')
    # Scrape the PDF link out of the article page with lwp-request.
    cmd = 'lwp-request -o links "%s" | grep pdf | cut -f2' % article_url
    r = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    url = r.stdout.read()
    fname = 'upload/orr-%s_%s.pdf' % (post_id, url.split('/')[-1].split('.')[0])
    fname = urllib.unquote(fname).decode('utf-8')
    wget = 'wget -q -nc -O %s "%s"' % (fname, url.strip())
    subprocess.call(wget, shell=True)
    return fname.replace('upload/', ''), article_url

#______________________________________________________________________________
def write_meta_dict(meta_dict):
    try:
        with open('upload/%s.json' % meta_dict['identifier'], 'wb') as f:
            f.write(json.dumps(meta_dict))
    except (IOError, KeyError):
        return None

#______________________________________________________________________________
def upload(identifier):
    script_path = os.path.join(ROOT_DIR, 'up.py')
    cmd = 'nohup %s %s &' % (script_path, identifier)
    subprocess.call(cmd, shell=True)

#______________________________________________________________________________
for i in range(13, 46):
    print i
    url = 'http://www.openreadingroom.com/page/%s' % i
    for article in get_articles(url):
        meta_dict = mk_meta_dict(article)
        get_pdf_response = get_pdf(article, meta_dict['post_id'])
        if not get_pdf_response:
            continue
        pdf, article_url = get_pdf_response
        # Identifier is the PDF basename, squeezed to ASCII for archive.org.
        meta_dict['identifier'] = (u'%s' % pdf.split('.')[0]).encode('ascii', 'ignore').strip('_-')
        print "CREATING::\t" + meta_dict['identifier']
        parsed_article = lxml.etree.parse(article_url.encode('utf-8'), parser=utf8_parser)
        description = parsed_article.xpath('body//div[@class="entry fix"]/p/text()')
        if description:
            meta_dict['description'] = description[0]
        else:
            meta_dict['description'] = None
        write_meta_dict(meta_dict)
        #upload(identifier)
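For orientation, one sidecar written by write_meta_dict might look like this (the file name and every value below are invented; the keys come from mk_meta_dict and the main loop):

    upload/orr-1234_ponniyin-selvan.json
    {"post_id": "1234",
     "title": "Ponniyin Selvan",
     "date": "2012-09-01",
     "subject": ["Novels", "historical"],
     "identifier": "orr-1234_ponniyin-selvan",
     "description": "First paragraph of the post."}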
#!/bin/bash
# Driver: read identifiers from the file "up" and kick off up.py for each,
# skipping items that already exist on archive.org.
while read item
do
    # The metadata API returns "{}" for items that don't exist yet.
    if [[ `curl -s http://archive.org/metadata/$item` != "{}" ]]
    then
        continue
    fi
    echo "Uploading: $item"
    if [ `ps -U jake u | grep python | wc -l` -gt 6 ]
    then
        echo "Too many python uploaders running... let's back off for a moment!"
        sleep 10
    fi
    nohup /1/incoming/tmp/openreadingroom/up.py $item &
done < up
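The driver reads identifiers from a plain file named up, one per line. Seeding and running it might look like this (the identifier and the script's file name are invented for illustration):

    echo orr-1234_ponniyin-selvan >> up
    ./upload.sh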
#!/usr/bin/env python
# up.py <identifier>: create the archive.org item (an S3 bucket) with the
# metadata from upload/<identifier>.json, then queue the PDF for upload
# via contrib_submit so filenames with funky characters survive.
import sys
import os
import glob
import time
import json
import codecs
from urllib import urlencode

import requests
import boto
from boto.s3.connection import OrdinaryCallingFormat

ROOT_DIR = '/1/incoming/tmp/openreadingroom'
UPLOAD_DIR = '/1/incoming/tmp/openreadingroom/upload'

CON = boto.connect_s3(os.environ['IA_S3_KEY'], os.environ['IA_S3_SECRET'],
                      host='s3.us.archive.org', is_secure=False,
                      calling_format=OrdinaryCallingFormat())

# S3 Uploader >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
#______________________________________________________________________________
def compile_meta_dict():
    meta_dict = json.load(codecs.open('upload/%s.json' % sys.argv[1],
                                      encoding='utf-8'))
    header_dict = {'x-archive-queue-derive': '0',
                   'x-archive-meta-noindex': 'true',
                   'x-amz-auto-make-bucket': '1',
                   'x-archive-meta-mediatype': 'texts',
                   'x-archive-meta-language': 'tam',
                   'x-archive-meta-date': str(meta_dict['date']),
                   'x-archive-meta-collection': 'openreadingroom',
                   'x-archive-meta-title': meta_dict['title']}
    if meta_dict.get('description'):
        header_dict['x-archive-meta-description'] = meta_dict['description']
    # Number the subject headers: x-archive-meta01-subject, 02, 03, ...
    i = 0
    for s in meta_dict['subject']:
        if i == 0:
            header_dict['x-archive-meta01-subject'] = s
            i += 2
        else:
            header_dict['x-archive-meta0%s-subject' % i] = s
            i += 1
    # Drop empty values; IA's S3 headers must be UTF-8 bytes.
    return {k: v.encode('utf-8') for k, v in header_dict.iteritems() if v}

#______________________________________________________________________________
def get_bucket(bucket_name):
    print '\t\tGetting Bucket...'
    bucket = CON.lookup(bucket_name)
    if bucket is not None:
        print '\t\tFound existing bucket %s' % bucket_name
        return bucket
    print '\t\tCreating new bucket %s' % bucket_name
    headers = compile_meta_dict()
    bucket = CON.create_bucket(bucket_name, headers=headers)
    # Bucket creation is asynchronous; poll for up to 10 minutes.
    i = 0
    while i < 60:
        b = CON.lookup(bucket_name)
        if b is not None:
            return bucket
        print '\t\tWaiting for bucket creation...'
        time.sleep(10)
        i += 1
    raise NameError("Could not create or lookup " + bucket_name)

#______________________________________________________________________________
def create_item(identifier):
    bucket = get_bucket(identifier)
    try:
        if len(identifier) <= 10:
            filename = glob.glob('upload/%s_*.pdf' % sys.argv[1])[0].replace('upload/', '')
        else:
            filename = glob.glob('upload/%s*.pdf' % sys.argv[1])[0].replace('upload/', '')
    except IndexError:
        return
    if bucket.get_key(filename) is not None:
        print '\t\tWARNING::\tFile already exists, not deleting from server!'
        return None
    # Contrib-submit the pdf to preserve filenames with funky characters.
    url = "http://www.us.archive.org/contrib_submit.php?"
    params = dict(identifier=identifier,
                  submitter="jake@archive.org",
                  update_mode=1,
                  no_derive=1)
    params['from_url'] = "rsync://collections.us.archive.org/incoming_1/tmp/openreadingroom/upload/%s" % filename
    req = requests.get(url, params=urlencode(params))
    print "CREATED::\thttp://archive.org/details/%s" % identifier

#______________________________________________________________________________
create_item(sys.argv[1])
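Usage: up.py takes a single identifier argument and expects the IA_S3_KEY and IA_S3_SECRET environment variables to hold archive.org S3 credentials, e.g. (key values and identifier invented):

    export IA_S3_KEY=XXXXXXXX
    export IA_S3_SECRET=YYYYYYYY
    ./up.py orr-1234_ponniyin-selvan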
jjjake commented Sep 19, 2012

This was a big hack; you don't want it. But it worked! 1,500+ Indian texts archived:

http://archive.org/details/openreadingroom

Move along ;-).
