Fix for Fluent serialization in translate/weblate
#
# Copyright 2021 Jack Grigg
#
# This file is part of the Translate Toolkit.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
r"""Manage the Fluent translation format.
It is a monolingual base class derived format with :class:`FluentFile`
and :class:`FluentUnit` providing file and unit level access.
"""
from fluent.syntax import FluentParser, ast, parse, serialize, visitor
from fluent.syntax.serializer import serialize_pattern
from fluent.syntax.stream import FluentParserStream
from translate.storage import base


def id_from_source(source):
    # If the caller does not provide a unit ID, we need to generate one
    # ourselves. A valid Fluent identifier has the following EBNF grammar:
    #   Identifier ::= [a-zA-Z] [a-zA-Z0-9_-]*
    #
    # This means we can't simply use the source string itself as the identifier
    # (as e.g. PO files do). Instead, we hash the source string with a
    # collision-resistant hash function.
    import hashlib

    return "gen-" + hashlib.sha256(source.encode()).hexdigest()


def source_from_entry(entry):
    # Serialized patterns come in two forms:
    # - Single-line patterns, which have a leading space we need to strip (for
    #   consistency with the expectations of what callers will set).
    # - Multiline patterns, which have a leading newline we need to preserve.
    return serialize_pattern(entry.value).lstrip(" ")
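
# For example, a single-line message like `hello = Hello, world!` serializes
# its pattern as " Hello, world!" (leading space, stripped here), while a
# multiline value serializes as a leading newline followed by indented lines
# (preserved here).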


class FluentUnit(base.TranslationUnit):
    """A Fluent message."""

    def __init__(self, source=None, entry=None):
        super().__init__(source)
        self._type = None
        self._id = None
        self._errors = {}
        self._attributes = {}
        if source is not None:
            self._type = ast.Message
            self._set_value(source)
            # Default to an ID derived from the source string.
            self._id = id_from_source(self.source)
        if entry is not None:
            self.parse(entry)

    def _set_value(self, value):
        self.source = value
        # The source and target are equivalent for monolingual units.
        self.target = self.source

    def getid(self):
        return self._id

    def setid(self, value):
        self._id = value

    def adderror(self, errorname, errortext):
        self._errors[errorname] = errortext

    def geterrors(self):
        return self._errors

    def isheader(self):
        return self._type in [
            ast.GroupComment,
            ast.ResourceComment,
        ]

    def getattributes(self):
        return self._attributes

    def parse(self, entry):
        # Handle this unit separately if it is invalid.
        if isinstance(entry, ast.Junk):
            for annotation in entry.annotations:
                self.adderror(annotation.code, annotation.message)
            self._set_value(entry.content)
            return

        this = self

        class Parser(visitor.Visitor):
            _found_id = False

            def visit_Attribute(self, node):
                this._attributes[node.id.name] = source_from_entry(node)

            def visit_Comment(self, node):
                if this._type not in [ast.Message, ast.Term]:
                    this._type = ast.Comment
                this.addnote(node.content)

            def visit_GroupComment(self, node):
                this._type = ast.GroupComment
                this.addnote(node.content)

            def visit_ResourceComment(self, node):
                this._type = ast.ResourceComment
                this.addnote(node.content)

            def visit_Identifier(self, node):
                if not self._found_id:
                    # Only save the first identifier we encounter (the entry's
                    # value will also contain identifiers if it has selectors).
                    this._id = node.name
                    self._found_id = True

            def visit_Message(self, node):
                this._type = ast.Message
                this._set_value(source_from_entry(node))
                self.generic_visit(node)

            def visit_Term(self, node):
                this._type = ast.Term
                this._set_value(source_from_entry(node))
                self.generic_visit(node)

        Parser().visit(entry)

    def to_entry(self):
        fp = FluentParser(with_spans=False)

        value = None
        attributes = None
        if self.source is not None:
            value = fp.maybe_get_pattern(FluentParserStream(self.source))
            attributes = [
                ast.Attribute(
                    ast.Identifier(id),
                    fp.maybe_get_pattern(FluentParserStream(value)),
                )
                for (id, value) in self._attributes.items()
            ]

        comment = None
        if self.getnotes():
            comment = ast.Comment(self.getnotes())

        if self._type in [ast.ResourceComment, ast.GroupComment, ast.Comment]:
            return self._type(content=self.getnotes())

        return (self._type if self._type is not None else ast.Message)(
            ast.Identifier(self.getid()),
            value=value,
            attributes=attributes,
            comment=comment,
        )
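
# Note that to_entry() re-parses the unit's stored text with FluentParser, so
# expressions such as "{ $name }" become real placeables in the regenerated
# AST rather than plain text.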


class FluentFile(base.TranslationStore):
    """A Fluent file."""

    Name = "Fluent file"
    Mimetypes = []
    Extensions = ["ftl"]
    UnitClass = FluentUnit

    def __init__(self, inputfile=None, **kwargs):
        super().__init__(**kwargs)
        self.filename = getattr(inputfile, "name", "")
        if inputfile is not None:
            fluentsrc = inputfile.read()
            self.parse(fluentsrc)

    def parse(self, fluentsrc):
        resource = parse(fluentsrc.decode("utf-8"))
        for entry in resource.body:
            self.addunit(FluentUnit(entry=entry))

    def serialize(self, out):
        body = [unit.to_entry() for unit in self.units]
        out.write(serialize(ast.Resource(body)).encode(self.encoding))
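

# A minimal usage sketch (assuming python-fluent and translate-toolkit are
# installed): load a small Fluent resource through FluentFile, inspect the
# resulting units, and serialize them back to Fluent syntax via to_entry().
if __name__ == "__main__":
    from io import BytesIO

    ftl = (
        b"# A greeting shown on the landing page.\n"
        b"hello = Hello, { $name }!\n"
    )

    ffile = FluentFile(BytesIO(ftl))
    for unit in ffile.units:
        print(unit.getid(), "->", repr(unit.source), "|", unit.getnotes())

    # Round-trip the units back into Fluent syntax.
    entries = [unit.to_entry() for unit in ffile.units]
    print(serialize(ast.Resource(entries)))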