Skip to content

Instantly share code, notes, and snippets.

@cnk
Created August 2, 2023 17:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cnk/df118f051b5d1ebbaa221b707078f4a3 to your computer and use it in GitHub Desktop.
Save cnk/df118f051b5d1ebbaa221b707078f4a3 to your computer and use it in GitHub Desktop.
Import documents from file + yml data
import os
import requests
from io import BytesIO
from collections import OrderedDict
from django.core.files import File
from wagtail.models import Collection
from wagtail.documents import get_document_model
from djunk.utils import get_or_generate
from core.logging import logger
class DocumentMigrator:
    """Import Wagtail documents from YAML records and export them back.

    Each YAML record describes one document: where its content comes from
    (a local ``file`` or a remote ``url``) plus optional title, collection,
    size, hash and tags.
    """

    # Seconds to wait for a remote server before giving up on a download, so a
    # single unresponsive host cannot stall the whole import.
    DOWNLOAD_TIMEOUT = 30

    def __get_doc_data(self, yml):
        """Return ``(binary_file_object, filename)`` for the document in *yml*.

        The caller owns the returned file object and must close it.

        Raises:
            RuntimeError: if *yml* has neither a ``file`` nor a ``url`` entry.
            requests.HTTPError: if the download answers with an error status.
        """
        local_name = yml.get('file')
        if local_name:
            # This import is dealing with files in the local file system,
            # looked up in ./documents relative to the current directory.
            path = os.path.join(os.getcwd(), 'documents', local_name)
            return open(path, 'rb'), local_name

        url = yml.get('url')
        if url:
            # This import is getting urls from which we can upload the docs;
            # retrieve the actual doc content.
            response = requests.get(url, timeout=self.DOWNLOAD_TIMEOUT)
            # Without this an HTML error page would be stored as the document.
            response.raise_for_status()
            return BytesIO(response.content), url.split('/')[-1]

        raise RuntimeError('We need either a local file or a url from which we can retrieve the file.')

    def __file_needs_update(self, doc, yml):
        """Return True when the stored file should be replaced from *yml*."""
        # If there isn't a current file, we definitely need to update.
        if not doc.file:
            return True
        # Otherwise, check if the exported file hash equals the one recorded
        # in the database.
        return yml.get('file_hash', None) != doc.get_file_hash()

    @staticmethod
    def __default_title(yml):
        """Build a title for records that carry no ``title_text``.

        Prefers ``filename``, then the basename of ``file``, then the last
        path segment of ``url``; ``%20`` escapes become spaces.
        """
        name = (
            yml.get('filename')
            or os.path.basename(yml.get('file') or '')
            or (yml.get('url') or '').split('/')[-1]
        )
        return name.replace('%20', ' ')

    def __resolve_collection(self, site_helper, yml):
        """Return the collection the document described by *yml* belongs in.

        A record without ``collection_name`` goes into the site collection.
        A named collection is looked up among the site collection's
        descendants; when none matches, the site collection is used and a
        warning is logged rather than assigning ``None``.
        """
        name = yml.get('collection_name')
        if not name:
            return site_helper.collection

        match = Collection.objects.descendant_of(site_helper.collection).filter(name=name).first()
        if match is None:
            logger.warning(
                'importer.document.collection-missing',
                collection_name=name,
                file=yml.get('file', yml.get('url', 'NO FILE?!'))
            )
            return site_helper.collection
        return match

    def create(self, site_helper, yml, dry_run=False):
        """Create or update the document described by *yml*.

        Example record::

            documents:
              - id: 2
                collection_name: Clip Art
                title_text: foobar
                file: foobar.jpg
                tags:
                  - tag1
                  - tag2

        Args:
            site_helper: provides ``import_id(id)`` and ``collection`` for the
                site being imported into.
            yml: mapping for a single document record.
            dry_run: when true, only log what would be imported.

        Returns:
            The saved document, or ``None`` on a dry run.
        """
        source_label = yml.get('file', yml.get('url', 'NO FILE?!'))

        if dry_run:
            logger.info('importer.document.create.dry-run', file=source_label)
            return None

        # NOTE(review): get_or_generate is a project helper; it is used here
        # as returning (instance, created) - confirm against djunk.utils.
        doc, created = get_or_generate(get_document_model(), import_id=site_helper.import_id(yml['id']))
        doc.title = yml.get('title_text', self.__default_title(yml))
        doc.collection = self.__resolve_collection(site_helper, yml)

        source = None
        try:
            if created or self.__file_needs_update(doc, yml):
                source, filename = self.__get_doc_data(yml)
                doc.file = File(source, name=filename.replace('%20', ' '))
                doc.file_size = yml.get('filesize') or len(doc.file)
                doc.file_hash = yml.get('file_hash', '')
            doc.save()
        finally:
            # The content has been handed to storage (or the save failed);
            # either way the source handle must not be left open.
            if source is not None:
                source.close()

        # NOTE(review): called for its side effect - Wagtail is expected to
        # compute and store the hash when file_hash is empty; confirm for the
        # installed version.
        doc.get_file_hash()

        if yml.get('tags', None):
            doc.tags.set(yml['tags'])

        op = 'create' if created else 'update'
        logger.info('importer.document.{}'.format(op), file=source_label)
        return doc

    def export(self, doc):
        """Return an ``OrderedDict`` describing *doc* in the import format.

        ``collection_name`` is included only for collections deeper than the
        default one, and ``tags`` only when the document has any.
        """
        doc_data = OrderedDict([
            ('id', doc.id),
            ('file', doc.filename),
            ('filesize', doc.file_size),
            ('file_hash', doc.file_hash),
            ('title_text', doc.title),
        ])
        # If this is in a collection other than the default collection,
        # export the collection name.
        if doc.collection.depth > 2:
            doc_data['collection_name'] = doc.collection.name

        # One query instead of exists() followed by all().
        tag_names = [tag.name for tag in doc.tags.all()]
        if tag_names:
            doc_data['tags'] = tag_names
        return doc_data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment