Created
September 1, 2023 14:26
-
-
Save DrDanL/ef0d4b84880436d6df08106dc7e455f8 to your computer and use it in GitHub Desktop.
Query and download data from Google Firestore using Python and firebase_admin
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import firebase_admin
from firebase_admin import credentials, firestore
import pandas as pd
# set the root folder path (placeholder: replace before running)
base_url = '<BASE URL HERE>'

# page size used when downloading data, e.g. at most 1000 documents per request
limit = 1000

# create the connection only once: re-running this code (for example in a
# notebook) reuses the default app instead of initialising it a second time.
# firebase_admin.get_app() raises ValueError when the default app does not exist.
try:
    default_app = firebase_admin.get_app()
except ValueError:
    print('Setting connection')
    # use the service key to authorise the login
    cred = credentials.Certificate('ServiceAccountKey.json')
    # author's note: the storage bucket option is supplied even though only
    # Firestore data is downloaded here (placeholder: replace before running)
    default_app = firebase_admin.initialize_app(cred, {
        'storageBucket': '<STORAGE BUCKET'
    })

# start the firestore client
store = firestore.client()
# declare a function to download every document of a collection, page by page
def stream_collection_loop(collection, count, cursor=None, page_size=None):
    """Download all documents of a collection using cursor-based paging.

    Documents are requested in pages ordered by ``__name__``. The last
    snapshot of each full page becomes the ``start_after`` cursor of the next
    request; a page that is not exactly ``page_size`` long ends the loop.

    Args:
        collection: Object exposing ``limit``, ``order_by``, ``start_after``
            and ``stream`` (e.g. the result of ``store.collection(...)``).
        count: Unused; kept so existing positional callers keep working.
        cursor: Optional value passed to ``start_after`` for the first page.
            A falsy value starts from the beginning of the collection.
        page_size: Maximum number of documents per request. Defaults to the
            module-level ``limit`` when omitted.

    Returns:
        A ``(dict_array, id_array)`` tuple of parallel lists: the
        ``to_dict()`` payload and the ``id`` of every downloaded document.

    Raises:
        ValueError: If the page size is smaller than 1.
    """
    if page_size is None:
        page_size = limit  # fall back to the module-level default page size
    if page_size < 1:
        # A zero/negative page size can never make progress; fail loudly
        # instead of hitting an IndexError on an empty page.
        raise ValueError(
            "page_size must be a positive integer, got %r" % (page_size,)
        )

    dict_array = []
    id_array = []
    while True:
        query = collection.limit(page_size).order_by('__name__')
        if cursor:
            query = query.start_after(cursor)
        # Materialise the page so its length and last snapshot are available.
        docs = list(query.stream())

        for doc in docs:
            dict_array.append(doc.to_dict())
            id_array.append(doc.id)

        if len(docs) != page_size:
            # Short (or empty) page: nothing left to download.
            break
        # Full page: continue after its last document.
        cursor = docs[-1]

    return dict_array, id_array
# example data downloading and streaming
# query the 'users' collection and download all of its documents
# (the second argument is not read by stream_collection_loop; 0 is a filler value)
dict_array, id_array = stream_collection_loop(store.collection('users'), 0)

# once the query has finished we have dict_array and id_array:
# dict_array holds the data stored in each Firestore document
# id_array holds the matching document ids, often known as doc_id

# now we can start to manage and process the data using pandas:
# build a DataFrame from the document data, indexed by document id
df = pd.DataFrame(data=dict_array, index=id_array)

# name the index axis so the document id is always referenced as doc_id
df.rename_axis("doc_id", inplace=True)

# the data has now been downloaded and can be viewed, saved and processed as needed
try:
    display(df.head(5))
except NameError:
    # display() only exists inside IPython/Jupyter; fall back to plain printing
    print(df.head(5))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment