Skip to content

Instantly share code, notes, and snippets.

@peterk
Last active June 6, 2016 05:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save peterk/f960e61291e0e81d4bb3073485151aef to your computer and use it in GitHub Desktop.
Save peterk/f960e61291e0e81d4bb3073485151aef to your computer and use it in GitHub Desktop.
Draft script to prepare Suecia images for upload to Wikimedia Commons
import re
import sys

import requests
from lxml import html
from lxml.builder import E
from lxml.etree import tostring
# Base URL of the Suecia dataset listing. The script downloads this page,
# prefixes it to every .tif link found there, and strips it again when
# deriving upload filenames in filedata().
url = "https://data.kb.se/datasets/2014/10/suecia/"
# Wiki template text with two %s placeholders (libris-id and url).
# NOTE(review): not referenced anywhere in the visible script; getdesc()
# formats its own inline variant that carries only the libris-id.
template = "{{Kungliga biblioteket image|libris-id=%s|url=%s}}"
def getmeta(libris_id, timeout=30):
    """Fetch the Libris xsearch JSON result for one Libris id.

    Args:
        libris_id: Value substituted into the ``onr:`` term of the query.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` blocks indefinitely on a stalled
            connection, which would hang the whole run.

    Returns:
        The decoded JSON body when the server replies with HTTP 200,
        otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    lurl = "http://libris.kb.se/xsearch/?query=onr:%s&format=json" % libris_id
    r = requests.get(lurl, timeout=timeout)
    if r.status_code != 200:
        # Made explicit so callers can see that "no metadata" is a
        # possible outcome they have to handle.
        return None
    return r.json()
def _image_record(href):
    """Turn one ``.tif`` href from the listing page into a url/id dict."""
    # The first two characters of the href are dropped before it is joined
    # to the base url (presumably a leading "./" -- confirm against the
    # listing page). The id is the text before the first "%", with every
    # "." and "/" removed.
    identifier = href.split("%")[0].replace(".", "").replace("/", "")
    return {"url": url + href[2:], "id": identifier}


# Download the dataset listing and collect every link whose href contains
# ".tif".
r = requests.get(url)
tree = html.fromstring(r.content)
linkels = tree.xpath("//a[contains(@href, '.tif')]/@href")
# Built as a real list: on Python 3 ``map`` returns a one-shot iterator, so
# a second pass over this module-level value would silently yield nothing.
image_urls = [_image_record(href) for href in linkels]
def getdesc(meta, img):
    """Assemble the description text for one image.

    The text starts with the title of the first xsearch hit, minus the
    literal " [Elektronisk resurs]". When that hit has a "creator" entry,
    " by <creator>." follows. A blank line and the Kungliga biblioteket
    image template carrying ``img["id"]`` close the text. The result is
    passed through cleanbibblo() before it is returned.
    """
    hit = meta["xsearch"]["list"][0]
    pieces = [hit["title"].replace(" [Elektronisk resurs]", "")]
    if "creator" in hit:
        pieces.append(" by " + hit["creator"] + ".")
    pieces.append("\n\n{{Kungliga biblioteket image|libris-id=%s}}\n" % img["id"])
    return cleanbibblo("".join(pieces))
def getdate(meta):
    """Return the "date" of the first xsearch hit, or "" when it has none."""
    hit = meta["xsearch"]["list"][0]
    # An empty string stands in for a missing date; the original author
    # left an open question about using an "unknown" template here instead.
    return hit["date"] if "date" in hit else ""
def cleanbibblo(text):
    """Strip bibliographic notation written in square brackets.

    Every run made of a single space, "[", any characters other than "]",
    and a closing "]" is deleted. A bracket group that is not preceded by
    a space is left untouched.
    """
    bracketed = re.compile(r' \[[^\]]*\]')
    return bracketed.sub('', text)
def filedata():
    """Build one ``<record>`` element per image in ``image_urls``.

    For every image the Libris metadata is looked up with getmeta() and
    turned into a record with ``source``, ``title``, ``filename``,
    ``description`` and ``date`` children. The filename is the image url
    with the base url and every "/" removed and each "%2C" replaced by "_".

    Images for which no metadata hit is available (getmeta() returned
    ``None``, or the hit list is missing or empty) are skipped with a
    warning on stderr, so one failed lookup does not abort the whole run.

    Returns:
        A list of the record elements, in the order of ``image_urls``.
    """
    filesxml = []
    for img in image_urls:
        meta = getmeta(img["id"])
        hits = (meta or {}).get("xsearch", {}).get("list") or []
        if not hits:
            sys.stderr.write(
                "Skipping %s: no Libris metadata for id %s\n"
                % (img["url"], img["id"])
            )
            continue
        filename = img["url"].replace(url, "").replace("/", "").replace("%2C", "_")
        filesxml.append(E.record(
            E.source(img["url"]),
            E.title(cleanbibblo(hits[0]["title"])),
            E.filename(filename),
            E.description(getdesc(meta, img)),
            E.date(getdate(meta))
        ))
    return filesxml
# Wrap every record in <metadata><records>...</records></metadata>.
xml = E.metadata(
    E.records(
        *filedata()
    )
)
# tostring() with a byte encoding returns bytes. The old ``print`` statement
# is a syntax error on Python 3, and print() there would emit the bytes
# repr, so the bytes go straight to the byte stream instead. Python 2's
# sys.stdout has no ``buffer`` attribute and already accepts bytes. The
# extra newline matches what ``print`` used to append.
document = tostring(xml, pretty_print=True, xml_declaration=True, encoding='utf-8')
_stdout = getattr(sys.stdout, "buffer", sys.stdout)
_stdout.write(document + b"\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment