Skip to content

Instantly share code, notes, and snippets.

@jakekara
Created April 11, 2019 14:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jakekara/89b4968c355c977a937820c9c345b510 to your computer and use it in GitHub Desktop.
Save jakekara/89b4968c355c977a937820c9c345b510 to your computer and use it in GitHub Desktop.
scrape all attachments from newhavenct.gov
import requests
import magic
import mimetypes
class Blob:
    """A single downloadable attachment ("blob") addressed by its ID.

    The download URL is ``base_url`` with the blob ID appended. The body is
    fetched lazily by :meth:`content` and cached after the first download
    that answers with HTTP 200.
    """

    def __init__(self,
                 blob_id,
                 base_url="http://www.newhavenct.gov/civicax/filebank/blobdload.aspx?blobid=",
                 timeout=30):
        """Record the blob ID and build its download URL.

        Args:
            blob_id: Identifier appended to ``base_url`` (converted with
                ``str``).
            base_url: URL prefix the blob ID is appended to.
            timeout: Seconds to wait on each HTTP request, so a stalled
                server cannot hang the caller indefinitely.
        """
        self.blob_id = blob_id
        self.url = base_url + str(blob_id)
        self.timeout = timeout
        self.__content = None

    def blob_exists(self):
        """Return True if a HEAD request for the blob URL answers 200."""
        # verify=False mirrors content() so both requests treat TLS
        # certificates the same way.
        resp = requests.head(self.url, verify=False, timeout=self.timeout)
        return resp.status_code == 200

    def guess_ext(self, default=".blob"):
        """Guess a file extension from the blob's sniffed MIME type.

        Args:
            default: Extension returned when the blob has no content or no
                extension is known for its MIME type.

        Returns:
            The guessed extension (including the leading dot) or ``default``.
        """
        data = self.content()
        if data is None:
            # Nothing was downloaded, so there is nothing to sniff.
            return default
        mime = magic.from_buffer(data, mime=True)
        ext = mimetypes.guess_extension(mime, strict=True)
        if ext is None:
            return default
        return ext

    def guess_filename(self, default=".blob"):
        """Return the blob ID followed by the guessed extension."""
        return str(self.blob_id) + self.guess_ext(default=default)

    def content(self):
        """Return the blob body, downloading it on first use.

        Returns:
            The response body, or ``None`` when the server did not answer
            with status 200. A failed download is not cached, so the next
            call tries again.
        """
        if self.__content is None:
            resp = requests.get(self.url, verify=False, timeout=self.timeout)
            if resp.status_code == 200:
                self.__content = resp.content
        return self.__content
import argparse
import os
from nhblob import Blob
from time import sleep
# Blob IDs to try: `first` is included, `last` is excluded (range semantics).
first = 30140
last = 35000
blob_ids = range(first, last)

# Create the output directory on a fresh run; listing a missing directory
# would otherwise abort the script before anything is downloaded.
if not os.path.isdir("blobs"):
    os.makedirs("blobs")

# IDs already on disk, taken from each file name up to its first dot.
# A set keeps the per-ID membership test in the main loop cheap.
downloaded_blob_ids = {x.split(".")[0] for x in os.listdir("blobs")}
def download_blob(i):
    """Download blob ``i`` and save it under the ``blobs`` directory.

    Nothing is written when the blob has no content. The file name is the
    blob ID plus the extension guessed by ``Blob.guess_filename``.

    Args:
        i: Blob ID to download.
    """
    b = Blob(i)
    content = b.content()
    if content is not None:
        path = os.path.join("blobs", b.guess_filename())
        # Context manager closes the file even if the write fails.
        with open(path, "wb") as out:
            out.write(content)
    # NOTE(review): the original indentation was lost; this assumes the
    # one-second pause applies after every request, not only after a
    # successful save. Confirm against the original script.
    sleep(1)
# Try every candidate ID in order, printing each one as a progress marker.
for i in blob_ids:
    print(i)
    # Skip already downloaded blobs
    if str(i) in downloaded_blob_ids:
        continue
    try:
        download_blob(i)
    except Exception as exc:
        # Best-effort scrape: report the failure and move on to the next ID.
        # Catching Exception rather than everything lets Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop the run.
        print("failed to download blob {}: {}".format(i, exc))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment