Skip to content

Instantly share code, notes, and snippets.

@alekssamos
Created February 3, 2023 15:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alekssamos/d565778155fa848c477c6897eec67638 to your computer and use it in GitHub Desktop.
Persistent queue with saving to disk (Python)
"""
Persistent queue
A permanent queue with saving to disk. It is restored on restart.
Only pending elements — those not yet taken for processing — are restored.
Source:
Persistent Queue « Python recipes « ActiveState Code
https://code.activestate.com/recipes/579124-persistent-queue/
PERSISTENT QUEUE (PYTHON RECIPE)
FORKED FROM RECIPE 501154 (SIMPLE FILE-BASED PERSISTENT QUEUE)
A class for persistent queues.
A persistent queue is useful when the size of the queue or the items prohibits the entire queue from residing in memory.
Persistent queues operate with a directory structured as follows:
/ index 0 1 . . N
0 .. N are files that contain items, the number of items per file is configured in the queue constructor with the cache_size parameter.
The 'index' file contains pointers to the current item file for dequeue (or head of the queue) and the item file for enqueue (or tail of the queue).
Whenever the queue cache for dequeue is empty, the file containing the head of the queue by the head pointer is loaded into the cache. Whenever the queue cache for enqueue is full, the queue cache for enqueue is stored in the file pointed to by the tail pointer. The enqueue and dequeue cache can be explicitly synchronized onto stable storage by calling the sync() method.
By default, items enqueued on the queue must be serializable with the marshal module in the standard python distribution. However, you may pass an alternative module to the 'marshal' argument in the queue constructor, e.g. pickle.
The PersistentQueue does not use fsync() to flush the disk cache onto disk, but this would be a straightforward modification through subclassing.
Multiple threads can safely use the queue in the same process, but there is no external synchronization between processes. Thus, if two processes open the same queue, the queue will ultimately be corrupted if/when a process causes the queue data to be written to disk without the other process knowing.
"""
import os, sys, glob
try:
import dill as marshal
import dill.settings
from dill import FILE_FMODE
dill.settings.update({'byref': True, 'fmode': FILE_FMODE, 'recurse': True, 'ignore': True})
except ImportError:
import marshal
import _thread as thread
# Filename used for index files, must not contain numbers
INDEX_FILENAME = 'index'
# Exception thrown when calling get() on an empty queue
class Empty(Exception): pass
class PersistentQueue:
def __init__(self, name, cache_size=512, marshal=marshal):
"""
Create a persistent FIFO queue named by the 'name' argument.
The number of cached queue items at the head and tail of the queue
is determined by the optional 'cache_size' parameter. By default
the marshal module is used to (de)serialize queue items, but you
may specify an alternative serialize module/instance with the
optional 'marshal' argument (e.g. pickle).
"""
assert cache_size > 0, 'Cache size must be larger than 0'
self.name = name
self.cache_size = cache_size
self.marshal = marshal
self.index_file = os.path.join(name, INDEX_FILENAME)
self.temp_file = os.path.join(name, 'tempfile')
self.mutex = thread.allocate_lock()
self._init_index()
def _init_index(self):
if not os.path.exists(self.name):
os.mkdir(self.name)
if os.path.exists(self.index_file):
index_file = open(self.index_file)
self.head, self.tail = map(lambda x: int(x),
index_file.read().split(' '))
index_file.close()
else:
self.head, self.tail = 0, 1
def _load_cache(cache, num):
name = os.path.join(self.name, str(num))
mode = 'rb+' if os.path.exists(name) else 'wb+'
cachefile = open(name, mode)
try:
setattr(self, cache, self.marshal.load(cachefile))
except EOFError:
setattr(self, cache, [])
os.fsync(cachefile)
cachefile.close()
_load_cache('put_cache', self.tail)
_load_cache('get_cache', self.head)
assert self.head < self.tail, 'Head not less than tail'
def _sync_index(self):
assert self.head < self.tail, 'Head not less than tail'
index_file = open(self.temp_file, 'w')
index_file.write('%d %d' % (self.head, self.tail))
os.fsync(index_file)
index_file.close()
if os.path.exists(self.index_file):
os.remove(self.index_file)
os.rename(self.temp_file, self.index_file)
def _split(self):
put_file = os.path.join(self.name, str(self.tail))
temp_file = open(self.temp_file, 'wb')
self.marshal.dump(self.put_cache, temp_file)
temp_file.close()
if os.path.exists(put_file):
os.remove(put_file)
os.rename(self.temp_file, put_file)
self.tail += 1
if len(self.put_cache) <= self.cache_size:
self.put_cache = []
else:
self.put_cache = self.put_cache[:self.cache_size]
self._sync_index()
def _join(self):
current = self.head + 1
if current == self.tail:
self.get_cache = self.put_cache
self.put_cache = []
else:
get_file = open(os.path.join(self.name, str(current)), 'rb')
self.get_cache = self.marshal.load(get_file)
get_file.close()
try:
os.remove(os.path.join(self.name, str(self.head)))
except:
pass
self.head = current
if self.head == self.tail:
self.head = self.tail - 1
self._sync_index()
def _sync(self):
self._sync_index()
get_file = os.path.join(self.name, str(self.head))
temp_file = open(self.temp_file, 'wb')
self.marshal.dump(self.get_cache, temp_file)
temp_file.close()
if os.path.exists(get_file):
os.remove(get_file)
os.rename(self.temp_file, get_file)
put_file = os.path.join(self.name, str(self.tail))
temp_file = open(self.temp_file, 'wb')
self.marshal.dump(self.put_cache, temp_file)
temp_file.close()
if os.path.exists(put_file):
os.remove(put_file)
os.rename(self.temp_file, put_file)
def __len__(self):
"""
Return number of items in queue.
"""
self.mutex.acquire()
try:
return (((self.tail-self.head)-1)*self.cache_size) + \
len(self.put_cache) + len(self.get_cache)
finally:
self.mutex.release()
def sync(self):
"""
Synchronize memory caches to disk.
"""
self.mutex.acquire()
try:
self._sync()
finally:
self.mutex.release()
def put(self, obj):
"""
Put the item 'obj' on the queue.
"""
self.mutex.acquire()
try:
self.put_cache.append(obj)
if len(self.put_cache) >= self.cache_size:
self._split()
finally:
self.mutex.release()
self.sync()
def get(self):
"""
Get an item from the queue.
Throws Empty exception if the queue is empty.
"""
self.mutex.acquire()
try:
if len(self.get_cache) > 0:
return self.get_cache.pop(0)
else:
self._join()
if len(self.get_cache) > 0:
return self.get_cache.pop(0)
else:
raise Empty
finally:
self.mutex.release()
self.sync()
def close(self):
"""
Close the queue. Implicitly synchronizes memory caches to disk.
No further accesses should be made through this queue instance.
"""
self.mutex.acquire()
try:
self._sync()
if os.path.exists(self.temp_file):
try:
os.remove(self.temp_file)
except:
pass
finally:
self.mutex.release()
self.sync()
## Tests
if __name__ == "__main__":
ELEMENTS = 1000
p = PersistentQueue('test', 10)
print('Enqueueing %d items, cache size = %d' % (ELEMENTS,
p.cache_size))
for a in range(ELEMENTS):
p.put(str(a))
p.sync()
print('Queue length (using __len__):', len(p))
print('Dequeueing %d items' % (ELEMENTS/2))
for a in range(ELEMENTS/2):
p.get()
print('Queue length (using __len__):', len(p))
print('Dequeueing %d items' % (ELEMENTS/2))
for a in range(ELEMENTS/2):
p.get()
print('Queue length (using __len__):', len(p))
p.sync()
p.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment