Skip to content

Instantly share code, notes, and snippets.

@rafbarr
Created May 17, 2016 19:47
Show Gist options
  • Save rafbarr/a2b7fd7d41b6f45305b7cabea165563c to your computer and use it in GitHub Desktop.
Save rafbarr/a2b7fd7d41b6f45305b7cabea165563c to your computer and use it in GitHub Desktop.
#!/bin/bash
//usr/bin/env groovy -cp "$(dirname "$0")" "$0" "$@"; exit $?

// The two lines above form a bash/Groovy polyglot header: bash runs the second line, which
// re-launches this file under groovy (with the script's own directory on the classpath) and
// then exits with groovy's status; Groovy itself skips the shebang and reads the second line
// as an ordinary comment. "$@" is quoted so that arguments containing whitespace (e.g. a root
// dir with spaces in its path) reach the script as a single argument.
//
// Usage: <script> <root dir>
//
// Lints every Avro schema file found under the given root dir and prints the lint errors per
// file. Exits with status 1 when the root dir argument is missing, when at least one schema
// has lint errors, or when loading/parsing the schemas fails.

if (args.size() == 0) {
    println 'missing root dir'
    System.exit(1)
}

def linter = new AvroSchemaLinter()

try {
    // map every schema file to its lint result and keep only the files that have errors
    def lintErrors = AvroHelpers
        .loadSchemasFromFiles(new File(args[0]))
        .collectEntries { k, v -> [k, linter.lint(v)] }
        .findAll { it.value }

    if (lintErrors) {
        println(
            lintErrors.collect { k, v ->
                "file '$k':\n${v.prettyPrint(4, 4)}"
            }.join('\n\n')
        )
        System.exit(1)
    } else {
        println "all looks good!"
    }
} catch (e) {
    // not every exception carries a message; fall back to its string form instead of
    // printing an unhelpful "null"
    println(e.message ?: e.toString())
    System.exit(1)
}
@Grab(group = 'org.apache.avro', module = 'avro', version = '1.8.0')
@Grab(group = 'org.yaml', module = 'snakeyaml', version = '1.17')
import groovy.io.FileType
import groovy.json.JsonOutput
import groovy.transform.stc.ClosureParams
import groovy.transform.stc.FirstParam
import org.apache.avro.Schema
import org.yaml.snakeyaml.Yaml
import org.yaml.snakeyaml.error.YAMLException
/**
 * Helpers for loading and parsing sets of Avro schemas that may reference each other by name.
 */
class AvroHelpers {

    /**
     * Parses a set of raw schemas resolving any names.
     *
     * A raw schema is anything that can be transformed into a valid Avro schema string. For
     * instance, a {@link File} instance holding an Avro schema could be converted by simply
     * reading its content.
     *
     * Schemas are parsed in passes: every pass retries the schemas that previously failed
     * because of a not-yet-defined name, with the parser bootstrapped with all the named types
     * parsed so far. Parsing stops when every schema has been parsed, or fails as soon as a
     * whole pass does not manage to parse any additional schema.
     *
     * @param <T> the type of the raw schema
     * @param rawSchemas a set of raw schemas
     * @param schemaAdaptor a closure mapping a raw schema into a valid Avro schema string
     * @return a mapping from the raw schema to the parsed schema
     * @throws RuntimeException if a schema fails to parse for any reason other than an
     *         undefined name, or if some names cannot be resolved by any of the raw schemas
     */
    static <T> Map<T, Schema> parse(
        Set<T> rawSchemas,
        @ClosureParams(value=FirstParam.class) Closure<String> schemaAdaptor = Closure.IDENTITY
    ) {
        def remainingRawSchemas = rawSchemas.collect() // defensive copy
        def schemasByRawSchema = [:]
        def schemasByName = [:]

        while (!remainingRawSchemas.isEmpty()) {
            def undefinedNames = new HashSet()
            def remainingBeforePass = remainingRawSchemas.size()

            remainingRawSchemas.removeAll {
                def rawSchemaProcessed = false

                // bootstrap the parser with previously parsed named schemas
                def schemaParser = new Schema.Parser().addTypes(schemasByName)

                try {
                    schemasByRawSchema[it] = schemaParser.parse(schemaAdaptor(it))
                    schemasByName = schemaParser.types
                    rawSchemaProcessed = true
                } catch (e) {
                    def undefinedNameMatcher = e.message =~ ~/"(.+)" is not a defined name.*/
                    if (undefinedNameMatcher.matches()) {
                        // may become resolvable once another raw schema defines the name
                        undefinedNames << undefinedNameMatcher[0][1]
                    } else {
                        // keep the original exception as the cause so its stack trace is
                        // not lost
                        throw new RuntimeException(
                            "parsing error '${e.message}' for schema '$it'", e
                        )
                    }
                }

                rawSchemaProcessed
            }

            // A pass that parsed nothing new would repeat identically forever, so the names
            // still undefined can never be resolved. Progress is used as the criterion
            // (rather than looking the reported names up among the parsed type names) because
            // the reported name is the reference exactly as written in the schema, which may
            // be namespace-relative and therefore differ from the name the type is stored
            // under.
            if (remainingRawSchemas.size() == remainingBeforePass) {
                throw new RuntimeException(
                    "couldn't resolve names: ${undefinedNames.collect { "'$it'" }.join(',')}"
                )
            }
        }

        schemasByRawSchema
    }

    /**
     * Loads and parses all *.yaml and *.avsc Avro schema files from a directory.
     *
     * YAML files are converted to JSON before being handed to the Avro parser; *.avsc files
     * are passed through as they are.
     *
     * @param rootDir the directory to load the Avro schemas from
     * @return a mapping from the Avro schema file to the parsed schema
     * @throws IllegalArgumentException if {@code rootDir} is not a directory
     * @throws RuntimeException if a file is not valid YAML or a schema cannot be parsed
     */
    static Map<File, Schema> loadSchemasFromFiles(File rootDir) {
        if (!rootDir.isDirectory()) {
            throw new IllegalArgumentException("'$rootDir' is not a valid directory")
        }

        def schemaFiles = new HashSet()
        rootDir.traverse(type: FileType.FILES, nameFilter: ~/.*\.(yaml|avsc)/) {
            schemaFiles << it
        }

        // NOTE(review): a default-constructed Yaml can instantiate arbitrary Java types from
        // tagged documents; if schema files may come from untrusted sources, consider
        // constructing it with SnakeYAML's SafeConstructor instead.
        def yamlParser = new Yaml()

        // parse() takes its own defensive copy, so the collected set is passed as is
        parse(schemaFiles) {
            if (it.path.endsWith(".yaml")) {
                try {
                    JsonOutput.toJson(yamlParser.load(it.text))
                } catch (YAMLException e) {
                    throw new RuntimeException(
                        "invalid YAML file '${it.path}': ${e.message}", e
                    )
                }
            } else {
                it.text
            }
        }
    }
}
@Grab(group = 'org.apache.avro', module = 'avro', version = '1.8.0')
import groovy.transform.Immutable
import org.apache.avro.Schema
import org.apache.avro.Schema.Field
import java.util.regex.Pattern
/**
 * Very simple linter for Avro schemas.
 *
 * Walks a schema recursively and reports naming-convention violations, missing documentation
 * and the use of aliases as a tree of {@link AvroSchemaLintError}s. A pattern check is skipped
 * when its pattern is unset; the documentation and alias checks are controlled by their flags.
 */
@Immutable(knownImmutableClasses = [Pattern])
final class AvroSchemaLinter {
    /** Pattern the full name (and aliases) of fixed, enum and record types must match. */
    Pattern typeNamePattern = ~/([a-z][a-z0-9_]*\.)+[A-Z][a-zA-Z0-9]*/
    /** Pattern record field names (and aliases) must match. */
    Pattern fieldNamePattern = ~/[a-z][a-z0-9_]*/
    /** Pattern enum symbols must match. */
    Pattern enumSymbolPattern = ~/[A-Z][A-Z0-9_]*/
    /** When true, schemas and fields without documentation are reported. */
    boolean doNotAllowMissingDoc = true
    /** When true, schemas and fields declaring aliases are reported. */
    boolean doNotAllowAliases = true

    /** Reports the full name and every alias of a named schema that violates the pattern. */
    private List<AvroSchemaLintError> checkTypeNames(Schema schema) {
        assert schema.type in [Schema.Type.FIXED, Schema.Type.ENUM, Schema.Type.RECORD]

        if (typeNamePattern) {
            def candidates = schema.aliases + schema.fullName
            def offenders = candidates.findAll { name -> !(name ==~ typeNamePattern) }
            return offenders.collect { name ->
                new AvroSchemaLintError(
                    message: "type name '$name' does not match '$typeNamePattern' pattern"
                )
            }
        }
        return []
    }

    /** Reports the name and every alias of a field that violates the pattern. */
    private List<AvroSchemaLintError> checkFieldNames(Field field) {
        if (fieldNamePattern) {
            def candidates = field.aliases() + field.name()
            def offenders = candidates.findAll { name -> !(name ==~ fieldNamePattern) }
            return offenders.collect { name ->
                new AvroSchemaLintError(
                    message: "field name '$name' does not match '$fieldNamePattern' pattern"
                )
            }
        }
        return []
    }

    /** Reports a schema or field whose documentation is absent or blank. */
    private List<AvroSchemaLintError> checkMissingDoc(element) {
        assert element instanceof Schema || element instanceof Field

        (doNotAllowMissingDoc && !element.doc?.trim()) ?
            [new AvroSchemaLintError(message: 'missing documentation')] :
            []
    }

    /** Reports a schema or field that declares any alias. */
    private List<AvroSchemaLintError> checkAliases(element) {
        assert element instanceof Schema || element instanceof Field

        (doNotAllowAliases && element.aliases) ?
            [new AvroSchemaLintError(message: 'aliases are not allowed')] :
            []
    }

    /** Lints a fixed schema: type names, documentation and aliases. */
    private List<AvroSchemaLintError> visitFixed(Schema schema) {
        assert schema.type == Schema.Type.FIXED

        def nameProblems = checkTypeNames(schema)
        def docProblems = checkMissingDoc(schema)
        def aliasProblems = checkAliases(schema)
        nameProblems + docProblems + aliasProblems
    }

    /** Lints an enum schema: the common named-type checks plus one check per symbol. */
    private List<AvroSchemaLintError> visitEnum(Schema schema) {
        assert schema.type == Schema.Type.ENUM

        def problems = checkTypeNames(schema) + checkMissingDoc(schema) + checkAliases(schema)
        if (!enumSymbolPattern) {
            return problems
        }

        for (symbol in schema.enumSymbols) {
            if (!(symbol ==~ enumSymbolPattern)) {
                problems << new AvroSchemaLintError(
                    message: "enum symbol '$symbol' does not match '$enumSymbolPattern' pattern"
                )
            }
        }
        problems
    }

    /**
     * Lints a record schema: the common named-type checks, then every field. The problems of
     * a field are grouped under a single error named after that field.
     */
    private List<AvroSchemaLintError> visitRecord(Schema schema, Set<Schema> alreadyVisited) {
        assert schema.type == Schema.Type.RECORD

        def problems = checkTypeNames(schema) + checkMissingDoc(schema) + checkAliases(schema)
        for (field in schema.fields) {
            // the record itself counts as visited while its fields are being walked, so that
            // self-referencing records do not recurse forever
            def fieldProblems = visitField(field, alreadyVisited + schema)
            if (fieldProblems) {
                problems << new AvroSchemaLintError(
                    message: "field '${field.name()}'", errors: fieldProblems
                )
            }
        }
        problems
    }

    /** Lints a field: its names, documentation and aliases, then the schema of its value. */
    private List<AvroSchemaLintError> visitField(Field field, Set<Schema> alreadyVisited) {
        def nameProblems = checkFieldNames(field)
        def docProblems = checkMissingDoc(field)
        def aliasProblems = checkAliases(field)
        def schemaProblems = visitSchema(field.schema(), alreadyVisited)
        nameProblems + docProblems + aliasProblems + schemaProblems
    }

    /**
     * Lints a schema of any type, dispatching on the schema type. Types with no dedicated
     * branch below yield no errors.
     *
     * @return an empty list when nothing is wrong (or the schema was already visited),
     *         otherwise a single error named after the schema and wrapping its problems
     */
    private List<AvroSchemaLintError> visitSchema(Schema schema, Set<Schema> alreadyVisited) {
        if (alreadyVisited.contains(schema)) {
            return []
        }

        def problems
        switch (schema.type) {
            case Schema.Type.ARRAY:
                problems = visitSchema(schema.elementType, alreadyVisited + schema)
                break
            case Schema.Type.MAP:
                problems = visitSchema(schema.valueType, alreadyVisited + schema)
                break
            case Schema.Type.UNION:
                problems = schema.types
                    .collect { visitSchema(it, alreadyVisited + schema) }
                    .flatten()
                break
            case Schema.Type.FIXED:
                problems = visitFixed(schema)
                break
            case Schema.Type.ENUM:
                problems = visitEnum(schema)
                break
            case Schema.Type.RECORD:
                problems = visitRecord(schema, alreadyVisited)
                break
            default:
                problems = null
        }

        if (!problems) {
            return []
        }

        // when the full name is just the lower-cased type name it is used alone; otherwise the
        // name is prefixed with the type
        def typeLabel = schema.type.name.toLowerCase()
        def heading = schema.fullName == typeLabel ?
            schema.fullName :
            "${schema.type.name.toLowerCase()} '${schema.fullName}'"
        [new AvroSchemaLintError(message: heading, errors: problems)]
    }

    /**
     * Lints a schema.
     *
     * @param schema the schema to lint
     * @return the root of the lint error tree, or {@code null} when there are no errors
     */
    AvroSchemaLintError lint(Schema schema) {
        def found = visitSchema(schema, new HashSet())
        found ? found[0] : null
    }
}
import groovy.transform.Immutable
/**
 * Tree like structure holding lint errors: a message plus the nested errors it groups.
 */
@Immutable
class AvroSchemaLintError {
    /** Description of this error, or the heading for the nested errors. */
    String message
    /** Nested errors; empty for a leaf. */
    List<AvroSchemaLintError> errors = []

    /**
     * Renders this error and all nested errors as text, one error per line.
     *
     * @param indentLevel number of spaces placed in front of this error's message
     * @param indentLevelIncrement number of extra spaces added for each nesting level
     * @return the message alone for a leaf; otherwise the message followed by a colon and
     *         the rendered nested errors, each on its own line
     */
    String prettyPrint(int indentLevel, int indentLevelIncrement) {
        def heading = ' ' * indentLevel + message
        if (!errors) {
            return heading
        }

        def childIndent = indentLevel + indentLevelIncrement
        def renderedChildren = errors.collect { child ->
            child.prettyPrint(childIndent, indentLevelIncrement)
        }
        heading + ':\n' + renderedChildren.join("\n")
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment