Skip to content

Instantly share code, notes, and snippets.

@dctanner
Created March 26, 2024 13:53
Show Gist options
  • Save dctanner/1a91df5e71eddd227f8a0ce6c6308419 to your computer and use it in GitHub Desktop.
Save dctanner/1a91df5e71eddd227f8a0ce6c6308419 to your computer and use it in GitHub Desktop.
Bulk download UseScraper.com Crawler results
import os
import sys
# Usage text interpolated into the error messages printed below when the
# API key or the job ID argument is missing.
example_usage = '''
Usage:
USESCRAPER_API_KEY=<api_key> python3 download-job-data.py <job_id>
'''
# Read the API key from the environment. ``os.environ.get`` returns None when
# the variable is unset; plain indexing would raise KeyError before the
# help message below could ever be printed. An empty value is treated the
# same as a missing one.
api_key = os.environ.get('USESCRAPER_API_KEY')
if not api_key:
    print(f'''
Please set the USESCRAPER_API_KEY environment variable to your API key.
You can view your API key at <https://app.usescraper.com/settings>.
{example_usage}
''', file=sys.stderr)
    # sys.exit rather than the ``exit`` builtin, which is only injected by
    # the ``site`` module and is not guaranteed to exist.
    sys.exit(1)
# The job ID is the first positional command-line argument. A missing or
# empty argument is rejected so we never request or write data for a blank ID.
job_id = sys.argv[1] if len(sys.argv) > 1 else None
if not job_id:
    print(f'''
Please provide a job ID as an argument.
{example_usage}
''', file=sys.stderr)
    # sys.exit rather than the ``exit`` builtin, which is only injected by
    # the ``site`` module and is not guaranteed to exist.
    sys.exit(1)
import requests
import json
# Page through the job's data endpoint and write every returned item to
# ``<job_id>.jsonl`` as one JSON document per line. The cursor returned by
# each response is sent back on the next request; the loop ends when the
# response carries no cursor.
# NOTE(review): the URL targets the *staging* API host - confirm this is
# the intended endpoint.
cursor = None
with open(f'{job_id}.jsonl', 'w', encoding='utf-8') as file, \
        requests.Session() as session:
    # A session reuses the underlying connection across pages and carries
    # the auth header on every request.
    session.headers['Authorization'] = f'Bearer {api_key}'
    while True:
        response = session.get(
            f'https://api.staging.usescraper.com/crawler/jobs/{job_id}/data',
            params={
                'limit': 400,
                # None on the first request; requests omits None-valued params.
                'cursor': cursor,
            },
            # Without a timeout a stalled connection would block forever.
            timeout=60,
        )
        response.raise_for_status()
        data = response.json()
        file.writelines(json.dumps(item) + '\n' for item in data['data'])
        # Flush once per page so partial results survive an interruption.
        file.flush()
        cursor = data.get('cursor')
        # Progress output: the cursor for the next page (None on the last one).
        print(cursor)
        # Also stop on an empty cursor, which would otherwise loop forever.
        if not cursor:
            break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment