Skip to content

Instantly share code, notes, and snippets.

@DrDanL
Created September 1, 2023 14:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DrDanL/ef0d4b84880436d6df08106dc7e455f8 to your computer and use it in GitHub Desktop.
Save DrDanL/ef0d4b84880436d6df08106dc7e455f8 to your computer and use it in GitHub Desktop.
Query and download data from Google Firestore using Python and firebase_admin
import pandas as pd
import firebase_admin
from firebase_admin import credentials, firestore
# set the root folder path
# NOTE(review): base_url is not referenced anywhere else in the visible code
base_url = '<BASE URL HERE>'

# used for paging when downloading data e.g. only 1000 documents downloaded per call
limit = 1000

# create the connection
# only initialise when no app is registered yet, so re-running this code
# (e.g. re-executing a notebook cell) does not call initialize_app a second time
if not firebase_admin._apps:
    print('Setting connection')
    # use the service key to authorise the login
    cred = credentials.Certificate('ServiceAccountKey.json')
    # ensure we have the storage bucket permission.
    # original author's note: Firebase appeared to want this option even when
    # no storage data is pulled -- TODO confirm whether it is actually required
    default_app = firebase_admin.initialize_app(cred, {
        'storageBucket': '<STORAGE BUCKET HERE>',
    })

# start the firestore client
store = firestore.client()
# declare a function to stream the documents to download
def stream_collection_loop(collection, count, cursor=None, page_size=None):
    """Download every document of a collection, one page at a time.

    Each request asks for at most ``page_size`` documents ordered by
    ``'__name__'``. When a page comes back full, its last snapshot is used as
    the ``start_after`` cursor for the next request; a short (or empty) page
    ends the loop.

    Args:
        collection: Query-like object providing ``limit``, ``order_by``,
            ``start_after`` and ``stream`` (e.g. ``store.collection('users')``).
        count: Not used by this function. Kept only so existing callers that
            pass it positionally keep working.
        cursor: Optional snapshot to resume after. When falsy, paging starts
            from the beginning of the collection.
        page_size: Maximum number of documents per request. Defaults to the
            module-level ``limit`` when omitted.

    Returns:
        A ``(dict_array, id_array)`` tuple of parallel lists: the
        ``to_dict()`` payload and the ``id`` of every downloaded document,
        in download order.

    Raises:
        ValueError: If the effective page size is smaller than 1.
    """
    if page_size is None:
        # fall back to the module-level default so existing calls are unchanged
        page_size = limit
    if page_size < 1:
        raise ValueError('page_size must be a positive integer')

    dict_array = []
    id_array = []
    while True:
        query = collection.limit(page_size).order_by('__name__')
        if cursor:
            query = query.start_after(cursor)
        docs = list(query.stream())

        dict_array.extend(doc.to_dict() for doc in docs)
        id_array.extend(doc.id for doc in docs)

        # anything other than a full page means there is nothing left to fetch
        if len(docs) != page_size:
            break
        # resume the next request right after the last document just received
        cursor = docs[-1]
    return dict_array, id_array
# example data downloading and streaming
# use the defined function to download every document in the 'users' collection
# the second argument (0) only fills the function's `count` parameter,
# which the function body never reads
dict_array, id_array = stream_collection_loop(store.collection('users'), 0)

# once the query has finished we now have the dict_array and id_array
# dict_array is the data stored within each Firestore document
# id_array is the document ids, often known as doc_id
# now this is where we can start to manage and process the data using pandas
# we create a pandas DataFrame with the data, indexed by the document ids
df = pd.DataFrame(data=dict_array, index=id_array)

# to make sure we have the right format and references I always rename the axis to ensure doc_id is referenced
df.rename_axis("doc_id", inplace=True)

# the data has now been downloaded and can be viewed, saved and processed as needed
# `display` is only defined inside IPython/Jupyter sessions; fall back to
# print so the script also runs under a plain Python interpreter
try:
    _show = display
except NameError:
    _show = print
_show(df.head(5))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment