Daniel Pett portableant

## splitPDFandOCR.py
#!/usr/bin/python
## Split pdf files into pages and ocr text (this is a bit honky, but works as a demo)
## Daniel Pett 11/2/2021
__author__ = 'portableant'
## Tested on Python 2.7.16
## Usage example
## python3 splitPdf.py -p . -f 1975_1989.pdf -d processed -n 1975_1989_processed -o ocr
## mac osx brew install poppler and echo 'export PATH="/usr/local/opt/qt/bin:$PATH"' >> ~/.zshrc

import argparse

## gist:2d67f707e7997271acc42857f0c6044d
<div class="card col-sm p-0 m-2">
    <div class="embed-responsive embed-responsive-16by9">
        <iframe src="https://www.youtube.com/embed/{{ video.id }}" frameborder="0"
                            allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture"
                            allowfullscreen class="embed-responsive-item"></iframe>
    </div>
 </div>


## howtouse.txt
Install the PIL library (published on PyPI as the Pillow package) on Windows at the PowerShell prompt by typing the command below

pip3 install Pillow

1. Now save your images into a folder on your computer and make a note of the path (don't work on your master images, use a copied folder)
2. Save the stitch.py file from here in the same directory
3. Remove all the images with the scale bar included as you don't need these
4. Now edit the stitch.py file and replace line 14 with the path to the directory where you have your Lenborough images
5. Now go back to powershell and type:

## splitNile.py
#!/usr/bin/python
## Split audio files into chunks
## Daniel Pett 1/5/2020
__author__ = 'portableant'
## Tested on Python 2.7.16 - yes I know I need to upgrade.

import argparse
import os
import speech_recognition as sr
import csv

## transcribedAudio.csv
"taskID","transcription","valid","comments","userID","inputBy","track"
91146,"","yes","",1,"Daniel Pett; Jeff Okazaki","https://fitz-audio-guide-micropasts.s3-eu-west-2.amazonaws.com/602_The_Nativity_Chunk0.mp3"
91146,"Domenico Ghirlandaio ran one of the biggest and most successful artist workshops in Florence towards the end of the Fifteenth century and it used to be thought that this","no","",,"Daniel Pett; Jeff Okazaki","https://fitz-audio-guide-micropasts.s3-eu-west-2.amazonaws.com/602_The_Nativity_Chunk0.mp3"
91146,"Domenico Ghirlandaio ran one of the biggest and most successful artist workshops in Florence towards the end of the Fifteenth Century and it used to be thought that this...","no","",243,"Daniel Pett; Jeff Okazaki","https://fitz-audio-guide-micropasts.s3-eu-west-2.amazonaws.com/602_The_Nativity_Chunk0.mp3"
,,,,,,
91147,"nativity scene was produced by a junior member under his supervision but cleaning in the 1990s revealed just how good some of the painting here is","yes","",243,"Daniel Pett; J

## splitAudio.py

#!/usr/bin/python
## Split audio files into chunks
## Daniel Pett 1/5/2020
__author__ = 'portableant'
## Tested on Python 2.7.16 - yes I know I need to upgrade.

import argparse
import os
import speech_recognition as sr

## splitAudio.py

#!/usr/bin/python
## Split audio files into chunks
## Daniel Pett 1/5/2020
__author__ = 'portableant'
## Tested on Python 2.7.13

import argparse
import os
import speech_recognition as sr

## gist:797812d094b4ca559b4b78e918d03a78
import speech_recognition as sr

# Path of the audio file to transcribe.
filename = "2101.wav"

recognizer = sr.Recognizer()
with sr.AudioFile(filename) as audio_file:
    # Load the recording into memory as audio data.
    audio_data = recognizer.record(audio_file)
    # Convert the recorded speech to text and print the transcript.
    text = recognizer.recognize_google(audio_data)
    print(text)

## scrapetosolr.py
import sys
from bs4 import BeautifulSoup
import solr
import hashlib
import urllib.request
import xml.etree.ElementTree as ET

# Maximum number of iterations to perform; 0 means no limit.
limit = 0 # How many iterations max?  Enter 0 for no limit.
# URL of the Solr instance.
# NOTE(review): 'solrURL' looks like a placeholder to replace before running - confirm.
solrUrl = 'solrURL' # The URL of the solr instance
# XML namespace (xmlns) of the sitemap schema.
sitemaps_ns = 'http://www.sitemaps.org/schemas/sitemap/0.9' # The xmlns for the sitemap schema

## workshop.geojson

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                portableant
                / workshop.geojson
            
            
              Last active
              February 27, 2020 16:06
            
              
                AHRC
              
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
	#!/usr/bin/python
	## Split pdf files into pages and ocr text (this is a bit honky, but works as a demo)
	## Daniel Pett 11/2/2021
	__author__ = 'portableant'
	## Tested on Python 2.7.16
	## Usage example
	## python3 splitPdf.py -p . -f 1975_1989.pdf -d processed -n 1975_1989_processed -o ocr
	## mac osx brew install poppler and echo 'export PATH="/usr/local/opt/qt/bin:$PATH"' >> ~/.zshrc

	import argparse
	<div class="card col-sm p-0 m-2">
	<div class="embed-responsive embed-responsive-16by9">
	<iframe src="https://www.youtube.com/embed/{{ video.id }}" frameborder="0"
	allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture"
	allowfullscreen class="embed-responsive-item"></iframe>
	</div>
	</div>
	Install the PIL library (published on PyPI as the Pillow package) on Windows at the PowerShell prompt by typing the command below

	pip3 install Pillow

	1. Now save your images into a folder on your computer and make a note of the path (don't work on your master images, use a copied folder)
	2. Save the stitch.py file from here in the same directory
	3. Remove all the images with the scale bar included as you don't need these
	4. Now edit the stitch.py file and replace line 14 with the path to the directory where you have your Lenborough images
	5. Now go back to powershell and type:
	#!/usr/bin/python
	## Split audio files into chunks
	## Daniel Pett 1/5/2020
	__author__ = 'portableant'
	## Tested on Python 2.7.16 - yes I know I need to upgrade.

	import argparse
	import os
	import speech_recognition as sr
	import csv
	"taskID","transcription","valid","comments","userID","inputBy","track"
	91146,"","yes","",1,"Daniel Pett; Jeff Okazaki","https://fitz-audio-guide-micropasts.s3-eu-west-2.amazonaws.com/602_The_Nativity_Chunk0.mp3"
	91146,"Domenico Ghirlandaio ran one of the biggest and most successful artist workshops in Florence towards the end of the Fifteenth century and it used to be thought that this","no","",,"Daniel Pett; Jeff Okazaki","https://fitz-audio-guide-micropasts.s3-eu-west-2.amazonaws.com/602_The_Nativity_Chunk0.mp3"
	91146,"Domenico Ghirlandaio ran one of the biggest and most successful artist workshops in Florence towards the end of the Fifteenth Century and it used to be thought that this...","no","",243,"Daniel Pett; Jeff Okazaki","https://fitz-audio-guide-micropasts.s3-eu-west-2.amazonaws.com/602_The_Nativity_Chunk0.mp3"
	,,,,,,
	91147,"nativity scene was produced by a junior member under his supervision but cleaning in the 1990s revealed just how good some of the painting here is","yes","",243,"Daniel Pett; J
	import speech_recognition as sr

	# Path of the audio file to transcribe.
	filename = "2101.wav"

	r = sr.Recognizer()
	# The statements below must be indented under the `with` so they run
	# while the audio file is open; without the indentation the snippet
	# does not parse.
	with sr.AudioFile(filename) as source:
	    # listen for the data (load audio to memory)
	    audio_data = r.record(source)
	    # recognize (convert from speech to text)
	    text = r.recognize_google(audio_data)
	    print(text)
	import sys
	from bs4 import BeautifulSoup
	import solr
	import hashlib
	import urllib.request
	import xml.etree.ElementTree as ET

	# Maximum number of iterations to perform; 0 means no limit.
	limit = 0 # How many iterations max? Enter 0 for no limit.
	# URL of the Solr instance.
	# NOTE(review): 'solrURL' looks like a placeholder to replace before running - confirm.
	solrUrl = 'solrURL' # The URL of the solr instance
	# XML namespace (xmlns) of the sitemap schema.
	sitemaps_ns = 'http://www.sitemaps.org/schemas/sitemap/0.9' # The xmlns for the sitemap schema