Dragon Dave McKee (scraperdragon)

  • Durham, United Kingdom
scraperdragon / gist:5081659
Last active December 14, 2015 11:48
Make R talk SQLite.
install.packages("RSQLite")  # note: compiles SQLite
library(RSQLite)
drv <- dbDriver("SQLite")
con <- dbConnect(drv, "demo.sqlite")  # opens (or creates) demo.sqlite in the working directory
dbListTables(con)
dbListFields(con, "table_name")
scraperdragon / run
Last active December 14, 2015 09:19
Run command with no output if no error.
#!/bin/bash
# Run the given command, capturing all output in a timestamped log file.
x=$(date +%Y%m%dT%H%M%S)
mkdir -p ~/log
"$@" > ~/log/"$x" 2>&1
error=$?
if [ "$error" -ne 0 ]
then
    # On failure, show the error code and the captured output,
    # then ping the ScraperWiki status API.
    echo "Error code: $error"
    cat ~/log/"$x"
    curl --data "type=error" https://x.scraperwiki.com/api/status > /dev/null 2>&1
fi
scraperdragon / gist:4260142
Created December 11, 2012 16:38
Parse date cleanly, fail if partial
def parsedate(datestring, silent=False):
    import dateutil.parser
    import re
    if not datestring: return None
    if re.match(r'\d{4}-\d{2}-\d{2}', datestring): return datestring  # already ISO-formatted
    info = dateutil.parser.parserinfo(dayfirst=True)
    # _parse is a private dateutil API; it leaves any field missing from the string as None
    value = dateutil.parser.parser(info)._parse(datestring)
    if value is None: return None
    retval = [value.year, value.month, value.day]
    nones = retval.count(None)
    # the gist preview ends here; plausible completion: fail (or stay silent) if the date is partial
    if nones == 3: return None
    if 0 < nones < 3:
        if silent: return None
        raise ValueError("partial date: %r" % datestring)
    return "%04d-%02d-%02d" % (value.year, value.month, value.day)
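A hypothetical usage sketch for the function above, assuming the completion of the last few lines:
parsedate("2012-12-11")                   # -> "2012-12-11" (already ISO, passed through)
parsedate("11 December 2012")             # -> "2012-12-11"
parsedate("December 2012")                # raises ValueError (month and year only)
parsedate("December 2012", silent=True)   # -> None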
scraperdragon / gist:4001096
Created November 2, 2012 12:36
Autoretry on requests
import requests
# works with the pre-1.0 requests API, where defaults lived in a module-level dict
requests.defaults.defaults['max_retries'] = 5
# ... rest of code ...
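With requests 1.0 and later the defaults dict is gone; a minimal sketch of the equivalent, configuring retries on a Session via HTTPAdapter (example.com is a placeholder URL):
import requests
from requests.adapters import HTTPAdapter

session = requests.Session()
session.mount('http://', HTTPAdapter(max_retries=5))
session.mount('https://', HTTPAdapter(max_retries=5))
response = session.get('https://example.com')  # placeholder URL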
scraperdragon / gist:3946977
Created October 24, 2012 16:04
Import scraperwiki.json (coffeescript)
fs=require 'fs'
settings = fs.readFileSync 'scraperwiki.json'
settings = JSON.parse settings
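Most of the other snippets here are Python, so for comparison, a minimal sketch of reading the same settings file with the standard json module:
import json
with open('scraperwiki.json') as f:
    settings = json.load(f)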
scraperdragon / double-encoding-fixes.py
Created September 24, 2012 10:38 — forked from robertklep/double-encoding-fixes.py
Functions to detect/fix double-encoded UTF-8 strings in Python
import re
# functions to detect/fix double-encoded UTF-8 strings
# Based on http://blogs.perl.org/users/chansen/2010/10/coping-with-double-encoded-utf-8.html
DOUBLE_ENCODED = re.compile("""
\xC3 (?: [\x82-\x9F] \xC2 [\x80-\xBF] # U+0080 - U+07FF
| \xA0 \xC2 [\xA0-\xBF] \xC2 [\x80-\xBF] # U+0800 - U+0FFF
| [\xA1-\xAC] \xC2 [\x80-\xBF] \xC2 [\x80-\xBF] # U+1000 - U+CFFF
| \xAD \xC2 [\x80-\x9F] \xC2 [\x80-\xBF] # U+D000 - U+D7FF
| [\xAE-\xAF] \xC2 [\x80-\xBF] \xC2 [\x80-\xBF] # U+E000 - U+FFFF
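The preview cuts off before the repair functions. A minimal Python 3 sketch of the usual fix, an assumption rather than the forked gist's own code (which worked on Python 2 byte strings):
def fix_double_encoded(s):
    # Mojibake like 'Ã©' appears when UTF-8 bytes are mistaken for Latin-1
    # and re-encoded; encoding back to Latin-1 recovers the original bytes.
    try:
        return s.encode('latin-1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return s  # not double-encoded (or not recoverable): leave unchanged

fix_double_encoded('Ã©')  # -> 'é'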
scraperdragon / gist:3665070
Created September 7, 2012 10:51
Make an identifier out of a string
def makeidentifier(s):
    import string
    s = s.strip().replace(' ', '_')
    valid_chars = "_%s%s" % (string.ascii_letters, string.digits)
    out = ''.join(c for c in s if c in valid_chars)
    if len(out) == 0:
        return '_'
    else:
        return out
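Hypothetical example outputs for the function above:
makeidentifier("Total cost (£)")  # -> 'Total_cost_'
makeidentifier("2012 figures")    # -> '2012_figures'
makeidentifier("")                # -> '_'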
scraperdragon / gist:3634011
Last active December 1, 2015 16:53
Let Python redirect unicode stdout to files without crashing. Requires LANG=C.UTF-8 or similar in .profile
import codecs
import sys
# wrap raw stdout in a UTF-8 writer so printing unicode to a file or pipe
# cannot raise UnicodeEncodeError (Python 2 idiom)
sys.stdout = codecs.getwriter('utf-8')(sys.__stdout__)
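On Python 3 (3.7+) the same effect needs no codecs wrapper; a minimal sketch:
import sys
# force UTF-8 output regardless of locale, e.g. when stdout is piped to a file
sys.stdout.reconfigure(encoding='utf-8')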
scraperdragon / gist:3621256
Created September 4, 2012 13:41
Get the value of a SELECT dropdown box in LXML
def get_select_value(node):
    # node is an LXML element (SELECT tag)
    try:
        return node.cssselect("option[selected='selected']")[0].text
    except IndexError:
        # nothing marked selected: fall back to the first option,
        # which is what a browser would submit by default
        return node.cssselect("option")[0].text
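A hypothetical usage sketch with lxml.html (the cssselect package must be installed for .cssselect() to work):
import lxml.html
doc = lxml.html.fromstring(
    "<select><option>One</option><option selected='selected'>Two</option></select>")
print(get_select_value(doc))  # -> Two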
scraperdragon / chrome2requests.py
Created August 22, 2012 11:25
Convert Chrome headers to Python's Requests dictionary
dict([[h.partition(':')[0], h.partition(':')[2]] for h in rawheaders.split('\n')])
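The one-liner keeps whatever whitespace follows each colon; a slightly more careful sketch (chrome_headers_to_dict is a hypothetical name, and rawheaders is assumed to be the header block copied from Chrome's Network tab):
def chrome_headers_to_dict(rawheaders):
    headers = {}
    for line in rawheaders.splitlines():
        name, _, value = line.partition(':')
        if name.strip():
            headers[name.strip()] = value.strip()
    return headers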