Skip to content

Instantly share code, notes, and snippets.

<?xml version='1.0' encoding='utf-8'?>
<xsl:stylesheet version='1.0' xmlns:xsl='http://www.w3.org/1999/XSL/Transform'>
<!--
Author: Rod Page
Source: http://iphylo.blogspot.com/2011/07/correcting-ocr-using-hocr-firefox.html#comment-400434491
-->
<xsl:output method='html' version='1.0' encoding='utf-8' indent='yes'/>
<xsl:variable name="scale" select="800 div //page/@width" />
@tfmorris
tfmorris / desertislanddiscs.py
Last active December 19, 2015 21:48
BBC Desert Island Discs scraper for the current/old ScraperWiki. Until ScraperWiki shuts down, the original is at https://scraperwiki.com/scrapers/desert_island_discs_records/
# Scrape BBC Desert Island Discs data including songs, books, and luxury item, if available, for the celebrity "castaways"
# based on original work by Francis Irving with the following changes by Tom Morris July 2012:
# - updated to current BBC page format
# - switched from BeautifulSoup to lxml
# - updated deprecated database calls
# - restructured to run as a single integrated process and not rescrape data it already extracted
import scraperwiki
import scraperwiki.apiwrapper
import lxml.html
@tfmorris
tfmorris / install.py
Last active August 29, 2015 13:57 — forked from bwhite/install.py
import shutil
import urllib2
import platform
import tempfile
import urllib
import os
import subprocess
import webbrowser
import stat
# -*- coding: utf-8 -*-
"""
common-crawl-cdx.py
A simple example program to analyze the Common Crawl index.
This is implemented as a single stream job which accesses S3 via HTTP,
so that it can easily be run from any laptop, but it could easily be
converted to an EMR job which processed the 300 index files in parallel.
@tfmorris
tfmorris / IntAccumulator.java
Created March 29, 2016 18:57
Online variance and standard deviation using Welford's algorithm and Java 8 Streams - just a sketch! only lightly tested!!
import java.util.Collections;
import java.util.EnumSet;
import java.util.IntSummaryStatistics;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.function.ToIntFunction;
import java.util.stream.Collector;
@tfmorris
tfmorris / gist:6c2a40bc7c7de9753a1d9c64b5ae2420
Created July 27, 2022 21:12
Wikidata SPARQL query for duplicate OpenLibrary author IDs
# Humans with the most non-deprecated OpenLibrary IDs (merge candidates)
SELECT ?item (COUNT(?olid) AS ?olidC)
{
VALUES (?ranks) { ( wikibase:PreferredRank ) ( wikibase:NormalRank ) }
?item p:P648 [ps:P648 ?olid;
wikibase:rank ?ranks;
] ;
wdt:P31 wd:Q5.
# SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}