Jason Ronallo jronallo

## NC-HB2-ids.sh
# This script requires the jq utility
# https://stedolan.github.io/jq/
# Datasets created with twarc
# https://github.com/DocNow/twarc

# Start from a clean output directory on every run.
mkdir -p NCHB2-ids
# -f: do not complain (or return a non-zero status) when no NCHB2* files
# exist yet, e.g. on the first run right after the directory is created.
rm -f NCHB2-ids/NCHB2*
# Create the empty file that will collect tweet ids, duplicates included.
touch NCHB2-ids/NCHB2-ids-with-dupes.txt

# Create a more relevant subset of the "North Carolina" search

## code4lib-vote
#!/usr/bin/env ruby

# To add this to cron do something like this to use the ruby wrapper script:
# */15 8-17 * * 1-5 env DISPLAY=:0.0 /home/jnronall/.rvm/wrappers/ruby-2.1.1/ruby /home/jnronall/bin/code4lib-vote > $HOME/tmp/code4lib-vote-cron.log 2>&1

require 'httpclient'
require 'json'
require 'date'
require 'libnotify'
require 'slop'

## net_ldap_overrides.rb
class Net::LDAP

  def initialize(args = {})
    @host = args[:host] || DefaultHost
    @port = args[:port] || DefaultPort
    @verbose = false # Make this configurable with a switch on the class.
    @auth = args[:auth] || DefaultAuth
    @base = args[:base] || DefaultTreebase
    encryption args[:encryption] # may be nil

## commandline use
$ ~/.cabal/bin/pandoc --version
pandoc 1.12.3.3


$ ~/.cabal/bin/pandoc -w dzslides --standalone --self-contained ~/tmp/pandoc-poster-image-test.md > ~/tmp/pandoc-poster-image-test.html

## dzslides2pdf.rb
#! /usr/bin/env ruby

# dzslides2pdf.rb
# dzslides2pdf.rb http://localhost/presentation_root presentation.html

require 'capybara/dsl'
require 'capybara-webkit'
# require 'capybara/poltergeist'
require 'fileutils'
include Capybara::DSL

## dabblet.css
/**
 * "Google Now" Card
 */
body {
	background: #e1e1e1;
	min-height: 100%;
	margin: auto;
}
ul.gNow {
	width: 450px;

## item.json
{
  "type": [
    "http:\/\/schema.org\/Organization"
  ],
  "properties": {
    "name": [
      "Riverdale"
    ],
    "url": [
      "http:\/\/d.lib.ncsu.edu\/collections\/catalog?f%5Bnames_facet%5D%5B%5D=Riverdale"

## tesse
#!/usr/bin/env ruby

# tesse: commandline tool for looking at tesseract OCR and cleaning the output

# Besides the following gem requirements it requires the following Linux programs:
# eog: for viewing the images
# wmctrl: for resizing and positioning the image viewing window

require 'tesseract'
require 'ffi/aspell'

## get_and_process_webdatacommons_data.sh
#!/usr/bin/env bash
# These steps will take a long time to download the data set.
# First, get the list of available NQuad files to download.
wget http://webdatacommons.org/2012-08/stats/files.list

# Only the microdata set is of interest for now, since that seems to be where
# schema.org/Book is used most. Keep just those entries in a separate list.
grep html-microdata files.list > microdata_files.list

# Download every file named in the list. Depending on your connection this
# will take a while, so let it run overnight.
wget --input-file=microdata_files.list

## common_crawl_hostname_count.rb
#!/usr/bin/env ruby

# a quick, simple script to partially parse output from https://github.com/trivio/common_crawl_index/blob/master/bin/remote_read
# and output subdomains in order of count

url_counts = {}
total_urls = 0
File.readlines(ARGV[0]).each do |line|
  url = line.split(' ').first
  reverse_hostname = url.split('/').first
	# This script requires the jq utility
	# https://stedolan.github.io/jq/
	# Datasets created with twarc
	# https://github.com/DocNow/twarc

	mkdir -p NCHB2-ids
	rm NCHB2-ids/NCHB2*
	touch NCHB2-ids/NCHB2-ids-with-dupes.txt

	# Create a more relevant subset of the "North Carolina" search
	#!/usr/bin/env ruby

	# To add this to cron do something like this to use the ruby wrapper script:
	# */15 8-17 * * 1-5 env DISPLAY=:0.0 /home/jnronall/.rvm/wrappers/ruby-2.1.1/ruby /home/jnronall/bin/code4lib-vote > $HOME/tmp/code4lib-vote-cron.log 2>&1

	require 'httpclient'
	require 'json'
	require 'date'
	require 'libnotify'
	require 'slop'
	class Net::LDAP

	def initialize(args = {})
	@host = args[:host] || DefaultHost
	@port = args[:port] || DefaultPort
	@verbose = false # Make this configurable with a switch on the class.
	@auth = args[:auth] || DefaultAuth
	@base = args[:base] || DefaultTreebase
	encryption args[:encryption] # may be nil
	$ ~/.cabal/bin/pandoc --version
	pandoc 1.12.3.3


	$ ~/.cabal/bin/pandoc -w dzslides --standalone --self-contained ~/tmp/pandoc-poster-image-test.md > ~/tmp/pandoc-poster-image-test.html
	#! /usr/bin/env ruby

	# dzslides2pdf.rb
	# dzslides2pdf.rb http://localhost/presentation_root presentation.html

	require 'capybara/dsl'
	require 'capybara-webkit'
	# require 'capybara/poltergeist'
	require 'fileutils'
	include Capybara::DSL
	/**
	* "Google Now" Card
	*/
	body {
	background: #e1e1e1;
	min-height: 100%;
	margin: auto;
	}
	ul.gNow {
	width: 450px;
	{
	"type": [
	"http:\/\/schema.org\/Organization"
	],
	"properties": {
	"name": [
	"Riverdale"
	],
	"url": [
	"http:\/\/d.lib.ncsu.edu\/collections\/catalog?f%5Bnames_facet%5D%5B%5D=Riverdale"
	#!/usr/bin/env ruby

	# tesse: commandline tool for looking at tesseract OCR and cleaning the output

	# Besides the following gem requirements it requires the following Linux programs:
	# eog: for viewing the images
	# wmctrl: for resizing and positioning the image viewing window

	require 'tesseract'
	require 'ffi/aspell'
	#!/usr/bin/env bash
	# These steps will take a long time to download the data set.
	# First, get the list of available NQuad files to download.
	wget http://webdatacommons.org/2012-08/stats/files.list

	# We're only interested in the microdata set right now since that seems to be where schema.org/Book is used more. So create a file list
	cat files.list | grep html-microdata > microdata_files.list

	# OK, this will take a while depending on your connection. Let it run overnight.
	wget -i microdata_files.list
	#!/usr/bin/env ruby

	# a quick, simple script to partially parse output from https://github.com/trivio/common_crawl_index/blob/master/bin/remote_read
	# and output subdomains in order of count

	url_counts = {}
	total_urls = 0
	File.readlines(ARGV[0]).each do |line|
	url = line.split(' ').first
	reverse_hostname = url.split('/').first