nblmc/identifier-puller.md

## identifier-puller.md

      
    Raw
  

              identifier-puller.md
            
          
    identifier-puller

Pulls down Digital Commonwealth identifiers from a search URL and prints them to stdout, one per line.

### Usage

Grab a URL generated from the Digital Commonwealth (DC) search portal and pass it as the first argument to the script. Redirect the output to whatever file you want.
### Example

```sh
python3 identifier-puller.py "https://collections.leventhalmap.org/search?utf8=✓&q=bromley" > bromleys.txt
```

  
## identifier-puller.py
#!/usr/bin/env python3

import requests
import json
import argparse


def _jsonSearchURL(searchURL):
	"""Return searchURL rewritten to target the JSON endpoint (/search.json?)."""
	return searchURL.replace("/search?", "/search.json?")


def _fetchJSON(url):
	"""GET url and return its decoded JSON body.

	Raises requests.HTTPError on a 4xx/5xx response so that an error page is
	never mistaken for a page of search results.
	"""
	r = requests.get(url, timeout=60)
	r.raise_for_status()
	return r.json()


def processResultsPage(resultsPageJSON, searchURL=None, fetchJSON=None):
	"""Collect document identifiers from a results page and all following pages.

	Args:
		resultsPageJSON: decoded JSON of one results page. The keys read are
			response.docs[].id, response.pages['last_page?'] and
			response.pages['next_page'].
		searchURL: the original (non-JSON) search URL. Needed only when the
			given page is not the last one, to build the follow-up requests.
		fetchJSON: callable taking a URL and returning decoded JSON. Defaults
			to an HTTP GET through requests.

	Returns:
		A list of every document id, in page order.

	Raises:
		ValueError: if further pages exist but no searchURL was supplied.
	"""
	if fetchJSON is None:
		fetchJSON = _fetchJSON

	identifiers = []
	page = resultsPageJSON

	# Walk the pages iteratively so each page's ids land in the same list
	# (the earlier recursive version discarded everything after page one).
	while True:
		response = page['response']
		identifiers.extend(doc['id'] for doc in response['docs'])

		pages = response['pages']
		nextPage = pages.get('next_page')

		# Also stop when no next page number is given, so we never request
		# "page=None" and loop forever on a malformed response.
		if pages['last_page?'] or nextPage is None:
			return identifiers

		if searchURL is None:
			raise ValueError('searchURL is required to fetch the remaining result pages')

		page = fetchJSON('{}{}{}'.format(_jsonSearchURL(searchURL), "&per_page=500&page=", nextPage))


def main():
	"""Parse the search URL from the command line and print one identifier per line."""
	parser = argparse.ArgumentParser(description='pull identifiers for an lmec digital collections search')
	parser.add_argument('searchURL', metavar='SEARCH_URL', type=str, help='Fully qualified search url, e.g. https://collections.leventhalmap.org/search?utf8=✓&q=bromley')

	args = parser.parse_args()
	searchURL = args.searchURL

	firstPage = _fetchJSON('{}{}'.format(_jsonSearchURL(searchURL), "&per_page=500"))

	for identifier in processResultsPage(firstPage, searchURL):
		print(identifier)


if __name__ == '__main__':
	main()
	#!/usr/bin/env python3

	import requests
	import json
	import argparse


	# NOTE(review): this section repeats the definitions that appear earlier in
	# the file; its original indentation was flattened, so the nesting below is
	# reconstructed. Confirm whether the duplicate is wanted at all.
	def _jsonSearchURL(searchURL):
		"""Return searchURL rewritten to target the JSON endpoint (/search.json?)."""
		return searchURL.replace("/search?", "/search.json?")


	def _fetchJSON(url):
		"""GET url and return its decoded JSON body.

		Raises requests.HTTPError on a 4xx/5xx response so that an error page is
		never mistaken for a page of search results.
		"""
		r = requests.get(url, timeout=60)
		r.raise_for_status()
		return r.json()


	def processResultsPage(resultsPageJSON, searchURL=None, fetchJSON=None):
		"""Collect document identifiers from a results page and all following pages.

		Args:
			resultsPageJSON: decoded JSON of one results page. The keys read are
				response.docs[].id, response.pages['last_page?'] and
				response.pages['next_page'].
			searchURL: the original (non-JSON) search URL. Needed only when the
				given page is not the last one, to build the follow-up requests.
			fetchJSON: callable taking a URL and returning decoded JSON. Defaults
				to an HTTP GET through requests.

		Returns:
			A list of every document id, in page order.

		Raises:
			ValueError: if further pages exist but no searchURL was supplied.
		"""
		if fetchJSON is None:
			fetchJSON = _fetchJSON

		identifiers = []
		page = resultsPageJSON

		# Walk the pages iteratively so each page's ids land in the same list
		# (the earlier recursive version discarded everything after page one).
		while True:
			response = page['response']
			identifiers.extend(doc['id'] for doc in response['docs'])

			pages = response['pages']
			nextPage = pages.get('next_page')

			# Also stop when no next page number is given, so we never request
			# "page=None" and loop forever on a malformed response.
			if pages['last_page?'] or nextPage is None:
				return identifiers

			if searchURL is None:
				raise ValueError('searchURL is required to fetch the remaining result pages')

			page = fetchJSON('{}{}{}'.format(_jsonSearchURL(searchURL), "&per_page=500&page=", nextPage))


	def main():
		"""Parse the search URL from the command line and print one identifier per line."""
		parser = argparse.ArgumentParser(description='pull identifiers for an lmec digital collections search')
		parser.add_argument('searchURL', metavar='SEARCH_URL', type=str, help='Fully qualified search url, e.g. https://collections.leventhalmap.org/search?utf8=✓&q=bromley')

		args = parser.parse_args()
		searchURL = args.searchURL

		firstPage = _fetchJSON('{}{}'.format(_jsonSearchURL(searchURL), "&per_page=500"))

		for identifier in processResultsPage(firstPage, searchURL):
			print(identifier)


	if __name__ == '__main__':
		main()