Tom Morris tfmorris

## gist:6c2a40bc7c7de9753a1d9c64b5ae2420
# Humans with the most non-deprecated OpenLibrary IDs (merge candidates)
SELECT ?item (COUNT(?olid) AS ?olidC)
{
  VALUES (?ranks) { ( wikibase:PreferredRank ) ( wikibase:NormalRank ) }
    ?item p:P648 [ps:P648 ?olid;
                   wikibase:rank ?ranks;
                  ] ;
          wdt:P31 wd:Q5.
#     SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}

## IntAccumulator.java
import java.util.Collections;
import java.util.EnumSet;
import java.util.IntSummaryStatistics;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.function.ToIntFunction;
import java.util.stream.Collector;

## common-crawl-cdx.py
# -*- coding: utf-8 -*-
"""
common-crawl-cdx.py

A simple example program to analyze the Common Crawl index.

This is implemented as a single stream job which accesses S3 via HTTP,
so that it can easily be run from any laptop, but it could easily be
converted to an EMR job which processed the 300 index files in parallel.

## install.py
import shutil
import urllib2
import platform
import tempfile
import urllib
import os
import subprocess
import webbrowser
import stat

## desertislanddiscs.py
# Scrape BBC Desert Island Discs data including songs, books, and luxury item, if available, for the celebrity "castaways"
# based on original work by Francis Irving with the following changes by Tom Morris July 2012:
#  - updated to current BBC page format
#  - switched from BeautifulSoup to lxml
#  - updated deprecated database calls
#  - restructured to run as a single integrated process and not rescrape data it already extracted

import scraperwiki
import scraperwiki.apiwrapper
import lxml.html

## abbyy2hocr.xsl
<?xml version='1.0' encoding='utf-8'?>
<xsl:stylesheet version='1.0' xmlns:xsl='http://www.w3.org/1999/XSL/Transform'>
<!--
Author: Rod Page
Source: http://iphylo.blogspot.com/2011/07/correcting-ocr-using-hocr-firefox.html#comment-400434491
-->
<xsl:output method='html' version='1.0' encoding='utf-8' indent='yes'/>


<xsl:variable name="scale" select="800 div //page/@width" />
	# Humans with the most non-deprecated OpenLibrary IDs (merge candidates)
	SELECT ?item (COUNT(?olid) AS ?olidC)
	{
	VALUES (?ranks) { ( wikibase:PreferredRank ) ( wikibase:NormalRank ) }
	?item p:P648 [ps:P648 ?olid;
	wikibase:rank ?ranks;
	] ;
	wdt:P31 wd:Q5.
	# SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
	}
	import java.util.Collections;
	import java.util.EnumSet;
	import java.util.IntSummaryStatistics;
	import java.util.Set;
	import java.util.function.BiConsumer;
	import java.util.function.BinaryOperator;
	import java.util.function.Function;
	import java.util.function.Supplier;
	import java.util.function.ToIntFunction;
	import java.util.stream.Collector;
	# -*- coding: utf-8 -*-
	"""
	common-crawl-cdx.py

	A simple example program to analyze the Common Crawl index.

	This is implemented as a single stream job which accesses S3 via HTTP,
	so that it can easily be run from any laptop, but it could easily be
	converted to an EMR job which processed the 300 index files in parallel.
	import shutil
	import urllib2
	import platform
	import tempfile
	import urllib
	import os
	import subprocess
	import webbrowser
	import stat
	# Scrape BBC Desert Island Discs data including songs, books, and luxury item, if available, for the celebrity "castaways"
	# based on original work by Francis Irving with the following changes by Tom Morris July 2012:
	# - updated to current BBC page format
	# - switched from BeautifulSoup to lxml
	# - updated deprecated database calls
	# - restructured to run as a single integrated process and not rescrape data it already extracted

	import scraperwiki
	import scraperwiki.apiwrapper
	import lxml.html
	<?xml version='1.0' encoding='utf-8'?>
	<xsl:stylesheet version='1.0' xmlns:xsl='http://www.w3.org/1999/XSL/Transform'>
	<!--
	Author: Rod Page
	Source: http://iphylo.blogspot.com/2011/07/correcting-ocr-using-hocr-firefox.html#comment-400434491
	-->
	<xsl:output method='html' version='1.0' encoding='utf-8' indent='yes'/>


	<xsl:variable name="scale" select="800 div //page/@width" />