John DeBovis (debovis)
@kkolk
kkolk / snowflake_copy_database_by_table.py
Last active December 21, 2022 03:11
Copy shared Snowflake Database table by table
#!/usr/bin/env python
# Useful when you want to make a full copy of a Snowflake database that is
# shared with your account into a database in your own account, but you
# can't use CLONE DATABASE, etc.
import os
import re
# https://docs.snowflake.com/en/user-guide/python-connector.html
import snowflake.connector
# Connect to Snowflake

from PyPDF2.generic import (
    DictionaryObject,
    NumberObject,
    FloatObject,
    NameObject,
    TextStringObject,
    ArrayObject,
)
# x1, y1 starts in bottom left corner
@davej
davej / fetch-timeout.js
Last active July 1, 2022 23:35
Add a pseudo timeout/deadline to a request using the ES6 fetch api
Promise.race([
  fetch('/foo'),
  new Promise((_, reject) =>
    setTimeout(() => reject(new Error('Timeout')), 7000)
  )
]);
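The same race-a-timer-against-the-work pattern exists in Python's asyncio, where `asyncio.wait_for` plays the role of `Promise.race` plus the rejecting timer. A minimal sketch, with an illustrative stand-in coroutine instead of a real network request:

```python
import asyncio

async def slow_op(delay, value):
    # Stand-in for a network request (illustrative only).
    await asyncio.sleep(delay)
    return value

async def with_deadline(coro, seconds):
    # asyncio.wait_for cancels the awaited task and raises TimeoutError
    # once the deadline passes, much like the Promise.race trick above.
    return await asyncio.wait_for(coro, timeout=seconds)

# Fast operation finishes before the deadline.
result = asyncio.run(with_deadline(slow_op(0.01, "ok"), 1.0))

# Slow operation is cut off by the deadline.
try:
    asyncio.run(with_deadline(slow_op(1.0, "late"), 0.05))
    timed_out = False
except asyncio.TimeoutError:
    timed_out = True
```

Unlike the `Promise.race` version, where the losing fetch keeps running in the background, `wait_for` actually cancels the pending task.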
@alanmbarr
alanmbarr / weightedavg.py
Last active May 24, 2020 11:00
Determine weighted average for hotel ratings
# http://times.cs.uiuc.edu/~wang296/Data/ tripadvisor dataset
import json
import numpy as np
import os
path = '/place/i/have/lots/of/json'
np.seterr(divide='ignore', invalid='ignore')
np.set_printoptions(nanstr="0")
# Loop over all files in the directory
for filename in os.listdir(path):
    with open(os.path.join(path, filename)) as json_file:
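The weighting step itself reduces to `np.average` with a `weights` argument; a minimal sketch, with made-up per-aspect ratings and weights standing in for the TripAdvisor data:

```python
import numpy as np

# Hypothetical per-aspect hotel ratings (e.g. cleanliness, location, value)
# and how heavily each aspect should count.
ratings = np.array([4.0, 3.0, 5.0])
weights = np.array([0.5, 0.3, 0.2])

# np.average computes sum(ratings * weights) / sum(weights).
weighted = np.average(ratings, weights=weights)
```

Since the weights here already sum to 1, this is just the dot product of ratings and weights (0.5·4.0 + 0.3·3.0 + 0.2·5.0 = 3.9).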
@dannguyen
dannguyen / README.md
Last active July 6, 2024 16:36
Using Python 3.x and Google Cloud Vision API to OCR scanned documents to extract structured data

Using Python 3 + Google Cloud Vision API's OCR to extract text from photos and scanned documents

Just a quickie test in Python 3 (using Requests) to see if Google Cloud Vision can be used to effectively OCR a scanned data table and preserve its structure, in the way that products such as ABBYY FineReader can OCR an image and provide Excel-ready output.
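The request side of such a test can be sketched as follows; the payload shape follows the Vision v1 `images:annotate` format (base64-encoded image content plus a list of requested features), and the fake image bytes are purely illustrative:

```python
import base64
import json

def build_ocr_request(image_bytes, max_results=1):
    # Vision's v1 images:annotate endpoint expects base64-encoded image
    # content and a list of feature requests; TEXT_DETECTION asks for OCR.
    return {
        "requests": [{
            "image": {"content": base64.b64encode(image_bytes).decode("ascii")},
            "features": [{"type": "TEXT_DETECTION", "maxResults": max_results}],
        }]
    }

payload = build_ocr_request(b"\x89PNG fake bytes")
body = json.dumps(payload)  # this is what gets POSTed with Requests
```

The actual POST (with an API key attached) and response parsing are omitted here; this only shows the request structure.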

The short answer: no. While Cloud Vision provides bounding-polygon coordinates in its output, it doesn't provide them at the word or region level, which would be needed to calculate the data delimiters.

On the other hand, the OCR quality is pretty good if you just need to identify text anywhere in an image, without regard to its physical coordinates. I've included two examples:

### 1. A low-resolution photo of road signs

@jvenator
jvenator / gist:9672772a631c117da151
Last active June 14, 2024 13:31
PDFtk Server Install Workaround for Mac OS X

Installing the PDFtk Server edition on your Mac

This workaround install is necessary because PDFtk was pulled from homebrew-cask due to issues with it aggressively overwriting file permissions, which could impact other installed libraries. See this homebrew-cask issue.
The following steps worked on Mac OS X 10.10.1 with a standard brew installation, using the PDFtk Mac OS X Server library, version 2.02.
Terminal commands are separated by a blank line; some commands wrap across multiple lines.

Download and extract the Mac OS X server install package

@konradkonrad
konradkonrad / es_features.py
Last active January 25, 2022 23:52
tfidf from elasticsearch
import elasticsearch
from math import log

def tfidf_matrix(es, index, doc_type, fields, size=10, bulk=500,
                 query=dict(match_all={})):
    """Generate tfidf for `size` documents of `index`/`doc_type`.

    All `fields` need to have the mapping "term_vector": "yes".
    This is the consuming version (i.e. get everything at once).

    :param es: elasticsearch client
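The tf-idf weighting the docstring refers to can be sketched independently of Elasticsearch; the term counts below are made-up, and this uses the classic unsmoothed formulation (term frequency times log of inverse document frequency):

```python
from math import log

def tfidf(tf, df, n_docs):
    # tf: occurrences of the term in this document
    # df: number of documents containing the term
    # n_docs: total documents in the collection
    return tf * log(n_docs / df)

# A term appearing in every document carries no discriminating weight:
# log(100 / 100) == 0, so the score is exactly 0.
score_common = tfidf(tf=3, df=100, n_docs=100)

# A rare term with the same raw frequency scores much higher.
score_rare = tfidf(tf=3, df=1, n_docs=100)
```

Real implementations (including the Elasticsearch term-vector statistics consumed above) often add smoothing terms, but the intuition is the same: frequent-in-document, rare-in-collection terms score highest.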
@anildigital
anildigital / gist:862675ec1b7bccabc311
Created July 26, 2014 18:27
Remove dangling docker images
docker rmi $(docker images -q -f dangling=true)
@ngpestelos
ngpestelos / remove-docker-containers.md
Last active May 31, 2024 15:10
How to remove unused Docker containers and images

May 8, 2018

I wrote this four years ago. Use this command instead:

$ docker rmi $(docker images -q -f dangling=true)
import numpy as np
import scipy.sparse as sp
import hat_trie
from sklearn.feature_extraction.text import CountVectorizer, _make_int_array

class HatTrieCountVectorizer(CountVectorizer):
    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create sparse feature matrix, and vocabulary where fixed_vocab=False