Jason Best jbest

## Ruby Notepad Bookmarklet
data:text/html, <style type="text/css">#e{position:absolute;top:0;right:0;bottom:0;left:0;}</style><div id="e"></div><script src="http://d1n0x3qji82z53.cloudfront.net/src-min-noconflict/ace.js" type="text/javascript" charset="utf-8"></script><script>var e=ace.edit("e");e.setTheme("ace/theme/monokai");e.getSession().setMode("ace/mode/ruby");</script>

## training.sh
#! /bin/bash

# build the environment
mkdir tessenv; cd tessenv
TROOT=`pwd`
mkdir $TROOT/stockfonts; mkdir $TROOT/build; mkdir $TROOT/build/eng
echo "Environment built"
# Get the stock english fonts from Google (old, but they work)
cd $TROOT/stockfonts
GET http://tesseract-ocr.googlecode.com/files/boxtiff-2.01.eng.tar.gz > boxtiff-2.01.eng.tar.gz

## readinglisturls.py
#!/usr/bin/env python
import plistlib
from shutil import copy
import subprocess
import os
from tempfile import gettempdir
import sys
import atexit

BOOKMARKS_PLIST = '~/Library/Safari/Bookmarks.plist'

## beautiful_idiomatic_python.md

      
              1 file
            
          
              280 forks
            
          
              29 comments
            
          
              847 stars
            
          
                JeffPaine
                / beautiful_idiomatic_python.md
            
            
              Last active
              January 14, 2023 05:37
            
              
                Moved to: https://github.com/JeffPaine/beautiful_idiomatic_python
              
          
    Moved

Now located at https://github.com/JeffPaine/beautiful_idiomatic_python.
Why it was moved

GitHub gists don't support pull requests or notifications, which made it impossible for me to maintain this (surprisingly popular) gist with fixes, respond to comments, and so on. In the interest of maintaining the quality of this resource for others, I've moved it to a proper repo. Cheers!

  
## OpenRefine-column-addition-OpenTree-TNRS-match_names
import urllib
import urllib2

try:
  # Base url for the TNRS match_names API
  url = 'http://api.opentreeoflife.org/v2/tnrs/match_names'
  # Encode data value to be looked up as an array of names:
  data = '{"names": ["'+value+'"]}'
  print "Looking up: " + data
  # Set HTTP headers:

## gist:9b2de3749d687fdbff3f
function performLogin(email, password) {
  var payload = {
    "username" : email,
    "password" : password
  };

  var options = {
    "method"  : "post",
    "payload" : payload
  };

## README.md

      
              2 files
            
          
              69 forks
            
          
              9 comments
            
          
              406 stars
            
          
                dannguyen
                / README.md
            
            
              Last active
              July 6, 2024 16:36
            
              
                Using Python 3.x and Google Cloud Vision API to OCR scanned documents to extract structured data
              
          
    Using Python 3 + Google Cloud Vision API's OCR to extract text from photos and scanned documents

Just a quickie test in Python 3 (using Requests) to see if Google Cloud Vision can be used to effectively OCR a scanned data table and preserve its structure, in the way that products such as ABBYY FineReader can OCR an image and provide Excel-ready output.
The short answer: No. While Cloud Vision provides bounding polygon coordinates in its output, it doesn't provide them at the word or region level, which would be needed to calculate the data delimiters.
On the other hand, the OCR quality is pretty good if you just need to identify text anywhere in an image, without regard to its physical coordinates. I've included two examples:
###### 1. A low-resolution photo of road signs

  
## stringart.py
import collections
import math
import os
import cv2
import numpy as np
import time

MAX_LINES = 4000
N_PINS = 36*8
MIN_LOOP = 20               # To avoid getting stuck in a loop

## image_classifier.py
from os import listdir, rename
from os.path import isfile, join
import tkinter as tk
from PIL import ImageTk, Image

IMAGE_FOLDER = "./images/unclassified"

images = [f for f in listdir(IMAGE_FOLDER) if isfile(join(IMAGE_FOLDER, f))]
unclassified_images = filter(lambda image: not (image.startswith("0_") or image.startswith("1_")), images)
current = None

## captions-shift.py
#!/usr/bin/env python3

import webvtt
from datetime import datetime, timedelta
import argparse

parser = argparse.ArgumentParser(description="Shift caption start \
				     and end times in a .vtt file")
parser.add_argument("inputfile", help="input filename, must be VTT format")
parser.add_argument("outputfile", help="output filename")
	#! /bin/bash

	# build the environment
	mkdir tessenv; cd tessenv
	TROOT=`pwd`
	mkdir $TROOT/stockfonts; mkdir $TROOT/build; mkdir $TROOT/build/eng
	echo "Environment built"
	# Get the stock english fonts from Google (old, but they work)
	cd $TROOT/stockfonts
	GET http://tesseract-ocr.googlecode.com/files/boxtiff-2.01.eng.tar.gz > boxtiff-2.01.eng.tar.gz
	#!/usr/bin/env python
	import plistlib
	from shutil import copy
	import subprocess
	import os
	from tempfile import gettempdir
	import sys
	import atexit

	BOOKMARKS_PLIST = '~/Library/Safari/Bookmarks.plist'
	import urllib
	import urllib2

	try:
	# Base url for the TNRS match_names API
	url = 'http://api.opentreeoflife.org/v2/tnrs/match_names'
	# Encode data value to be looked up as an array of names:
	data = '{"names": ["'+value+'"]}'
	print "Looking up: " + data
	# Set HTTP headers:
	function performLogin(email, password) {
	var payload = {
	"username" : email,
	"password" : password
	};

	var options = {
	"method" : "post",
	"payload" : payload
	};
	import collections
	import math
	import os
	import cv2
	import numpy as np
	import time

	MAX_LINES = 4000
	N_PINS = 36*8
	MIN_LOOP = 20 # To avoid getting stuck in a loop
	from os import listdir, rename
	from os.path import isfile, join
	import tkinter as tk
	from PIL import ImageTk, Image

	IMAGE_FOLDER = "./images/unclassified"

	images = [f for f in listdir(IMAGE_FOLDER) if isfile(join(IMAGE_FOLDER, f))]
	unclassified_images = filter(lambda image: not (image.startswith("0_") or image.startswith("1_")), images)
	current = None
	#!/usr/bin/env python3

	import webvtt
	from datetime import datetime, timedelta
	import argparse

	parser = argparse.ArgumentParser(description="Shift caption start \
	and end times in a .vtt file")
	parser.add_argument("inputfile", help="input filename, must be VTT format")
	parser.add_argument("outputfile", help="output filename")