@jjjake
Created September 19, 2012 23:00
openreadingroom.com scripts; it gets pretty ugly!
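Three pieces, in order: a scraper that pulls each post's PDF into upload/ and writes a JSON metadata sidecar next to it, a bash driver that feeds identifiers to up.py while skipping items already on archive.org, and up.py itself, which creates the archive.org item and queues the PDF via contrib_submit.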
#!/usr/bin/env python
# Scraper: walk openreadingroom.com's paginated index, pull each post's
# PDF into upload/, and write a JSON metadata sidecar per item.
import os
import json
import subprocess
import urllib

import lxml.etree

ROOT_DIR = os.getcwd()
utf8_parser = lxml.etree.HTMLParser(encoding='utf-8')

#______________________________________________________________________________
def get_articles(url):
    phtml = lxml.etree.parse(url.encode('utf-8'), parser=utf8_parser)
    return phtml.xpath('//article')

#______________________________________________________________________________
def mk_meta_dict(article):
    categories = article.xpath('div/div[@class="meta-pullout meta-left-pullout"]/ul/li/span[@class="category"]/a[@rel="category tag"]/text()')
    tags = article.xpath('div/div[@class="meta-pullout meta-left-pullout"]/ul/li/span[@class="tags tax"]/a[@rel="tag"]/text()')
    return {'post_id': article.get('id').split('-')[-1],
            'title': article.xpath('header/div/h2/a[@class="entry-title"]/text()')[0],
            'date': article.xpath('span[@class="updated"]')[0].get('title').split('T')[0],
            'subject': categories + tags}

#______________________________________________________________________________
def get_pdf(article, post_id):
    article_element = article.xpath('div/div[@class="entry entry-content fix"]/a[@class="suf-thumbnail-anchor-left"]')
    if not article_element:
        return None
    article_url = article_element[0].get('href')
    # Scrape the PDF link out of the article page with lwp-request.
    cmd = 'lwp-request -o links "%s" | grep pdf | cut -f2' % article_url
    r = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    url = r.stdout.read()
    fname = 'upload/orr-%s_%s.pdf' % (post_id, url.split('/')[-1].split('.')[0])
    fname = urllib.unquote(fname).decode('utf-8')
    wget = 'wget -q -nc -O %s "%s"' % (fname, url.strip())
    subprocess.call(wget, shell=True)
    return fname.replace('upload/', ''), article_url

#______________________________________________________________________________
def write_meta_dict(meta_dict):
    try:
        with open('upload/%s.json' % meta_dict['identifier'], 'wb') as f:
            f.write(json.dumps(meta_dict))
    except (IOError, KeyError):
        return None

#______________________________________________________________________________
def upload(identifier):
    script_path = os.path.join(ROOT_DIR, 'up.py')
    cmd = 'nohup %s %s &' % (script_path, identifier)
    subprocess.call(cmd, shell=True)

#______________________________________________________________________________
for i in range(13, 46):
    print i
    url = 'http://www.openreadingroom.com/page/%s' % i
    for article in get_articles(url):
        meta_dict = mk_meta_dict(article)
        get_pdf_response = get_pdf(article, meta_dict['post_id'])
        if not get_pdf_response:
            continue
        pdf, article_url = get_pdf_response
        # Identifier is the PDF basename, squeezed to ASCII for archive.org.
        meta_dict['identifier'] = (u'%s' % pdf.split('.')[0]).encode('ascii', 'ignore').strip('_-')
        print "CREATING::\t" + meta_dict['identifier']
        parsed_article = lxml.etree.parse(article_url.encode('utf-8'), parser=utf8_parser)
        description = parsed_article.xpath('body//div[@class="entry fix"]/p/text()')
        if description:
            meta_dict['description'] = description[0]
        else:
            meta_dict['description'] = None
        write_meta_dict(meta_dict)
        #upload(identifier)
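For orientation, one sidecar written by write_meta_dict might look like this (the file name and every value below are invented; the keys come from mk_meta_dict and the main loop):

    upload/orr-1234_ponniyin-selvan.json
    {"post_id": "1234",
     "title": "Ponniyin Selvan",
     "date": "2012-09-01",
     "subject": ["Novels", "historical"],
     "identifier": "orr-1234_ponniyin-selvan",
     "description": "First paragraph of the post."}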
#!/bin/bash
# Driver: read identifiers from the file "up" and kick off up.py for each,
# skipping items that already exist on archive.org.
while read item
do
    # The metadata API returns "{}" for items that don't exist yet.
    if [[ `curl -s http://archive.org/metadata/$item` != "{}" ]]
    then
        continue
    fi
    echo "Uploading: $item"
    if [ `ps -U jake u | grep python | wc -l` -gt 6 ]
    then
        echo "Too many python uploaders running... let's back off for a moment!"
        sleep 10
    fi
    nohup /1/incoming/tmp/openreadingroom/up.py $item &
done < up
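The driver reads identifiers from a plain file named up, one per line. Seeding and running it might look like this (the identifier and the script's file name are invented for illustration):

    echo orr-1234_ponniyin-selvan >> up
    ./upload.sh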
#!/usr/bin/env python
# up.py <identifier>: create the archive.org item (an S3 bucket) with the
# metadata from upload/<identifier>.json, then queue the PDF for upload
# via contrib_submit so filenames with funky characters survive.
import sys
import os
import glob
import time
import json
import codecs
from urllib import urlencode

import requests
import boto
from boto.s3.connection import OrdinaryCallingFormat

ROOT_DIR = '/1/incoming/tmp/openreadingroom'
UPLOAD_DIR = '/1/incoming/tmp/openreadingroom/upload'

CON = boto.connect_s3(os.environ['IA_S3_KEY'], os.environ['IA_S3_SECRET'],
                      host='s3.us.archive.org', is_secure=False,
                      calling_format=OrdinaryCallingFormat())

# S3 Uploader >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
#______________________________________________________________________________
def compile_meta_dict():
    meta_dict = json.load(codecs.open('upload/%s.json' % sys.argv[1],
                                      encoding='utf-8'))
    header_dict = {'x-archive-queue-derive': '0',
                   'x-archive-meta-noindex': 'true',
                   'x-amz-auto-make-bucket': '1',
                   'x-archive-meta-mediatype': 'texts',
                   'x-archive-meta-language': 'tam',
                   'x-archive-meta-date': str(meta_dict['date']),
                   'x-archive-meta-collection': 'openreadingroom',
                   'x-archive-meta-title': meta_dict['title']}
    if meta_dict.get('description'):
        header_dict['x-archive-meta-description'] = meta_dict['description']
    # Number the subject headers: x-archive-meta01-subject, 02, 03, ...
    i = 0
    for s in meta_dict['subject']:
        if i == 0:
            header_dict['x-archive-meta01-subject'] = s
            i += 2
        else:
            header_dict['x-archive-meta0%s-subject' % i] = s
            i += 1
    # Drop empty values; IA's S3 headers must be UTF-8 bytes.
    return {k: v.encode('utf-8') for k, v in header_dict.iteritems() if v}

#______________________________________________________________________________
def get_bucket(bucket_name):
    print '\t\tGetting Bucket...'
    bucket = CON.lookup(bucket_name)
    if bucket is not None:
        print '\t\tFound existing bucket %s' % bucket_name
        return bucket
    print '\t\tCreating new bucket %s' % bucket_name
    headers = compile_meta_dict()
    bucket = CON.create_bucket(bucket_name, headers=headers)
    # Bucket creation is asynchronous; poll for up to 10 minutes.
    i = 0
    while i < 60:
        b = CON.lookup(bucket_name)
        if b is not None:
            return bucket
        print '\t\tWaiting for bucket creation...'
        time.sleep(10)
        i += 1
    raise NameError("Could not create or lookup " + bucket_name)

#______________________________________________________________________________
def create_item(identifier):
    bucket = get_bucket(identifier)
    try:
        if len(identifier) <= 10:
            filename = glob.glob('upload/%s_*.pdf' % sys.argv[1])[0].replace('upload/', '')
        else:
            filename = glob.glob('upload/%s*.pdf' % sys.argv[1])[0].replace('upload/', '')
    except IndexError:
        return
    if bucket.get_key(filename) is not None:
        print '\t\tWARNING::\tFile already exists, not deleting from server!'
        return None
    # Contrib-submit the pdf to preserve filenames with funky characters.
    url = "http://www.us.archive.org/contrib_submit.php?"
    params = dict(identifier=identifier,
                  submitter="jake@archive.org",
                  update_mode=1,
                  no_derive=1)
    params['from_url'] = "rsync://collections.us.archive.org/incoming_1/tmp/openreadingroom/upload/%s" % filename
    req = requests.get(url, params=urlencode(params))
    print "CREATED::\thttp://archive.org/details/%s" % identifier

#______________________________________________________________________________
create_item(sys.argv[1])
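Usage: up.py takes a single identifier argument and expects the IA_S3_KEY and IA_S3_SECRET environment variables to hold archive.org S3 credentials, e.g. (key values and identifier invented):

    export IA_S3_KEY=XXXXXXXX
    export IA_S3_SECRET=YYYYYYYY
    ./up.py orr-1234_ponniyin-selvan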
jjjake commented Sep 19, 2012

This was a big hack; you don't want it. But it worked! 1,500+ Indian texts archived:

http://archive.org/details/openreadingroom

Move along ;-).
