-
-
Save ThisIsMissEm/13e623b91b6d89e7886dd8c731efaa9b to your computer and use it in GitHub Desktop.
A small netlify edge function to transform text/html documents with RDFa in them into text/turtle, see: https://forum.solidproject.org/t/using-netlify-functions-to-transform-rdfa-text-html-into-turtle/5765
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import type { Context } from "https://edge.netlify.com"; | |
import { Accepts } from "https://deno.land/x/accepts@2.1.1/mod.ts"; | |
// NOTE: deno-std isn't published with every Deno release, and esm.sh doesn't support `@types/`,
// which is why the imports below pin `deno-std` and pass `no-dts`.
import { RdfaParser } from "https://esm.sh/rdfa-streaming-parser@2.0.0?deno-std=0.161.0&target=deno&no-dts"; | |
import rdfSerializer from "https://esm.sh/rdf-serialize?deno-std=0.161.0&target=deno&no-dts"; | |
/**
 * Parses an HTML document as RDFa and re-serializes what was extracted.
 *
 * The HTML is written into an `RdfaParser`, whose output is handed to
 * `rdfSerializer.serialize`; the serializer's output is accumulated into a
 * single string.
 *
 * @param responseText - HTML source to feed to the RDFa parser.
 * @param options - `baseIRI` is passed to the parser as its base IRI;
 *   `format` is passed to the serializer as the output content type.
 * @returns The serialized text once the output stream ends. Rejects if either
 *   the parser or the serializer stream emits an `"error"` event.
 */
function convert(
  responseText: string,
  options: {
    baseIRI: string;
    format: string;
  }
): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const fail = (cause: unknown): void => reject(cause);

    const rdfaParser = new RdfaParser({
      baseIRI: options.baseIRI,
      contentType: "text/html",
    });
    rdfaParser.on("error", fail);

    const serialized = rdfSerializer.serialize(rdfaParser, {
      contentType: options.format,
    });

    // The whole serialization is buffered in memory. The original author notes
    // this puts a practical ceiling (roughly 500mb) on dataset size, which is
    // far more text than a page is expected to produce.
    let collected = "";
    serialized.on("error", fail);
    serialized.on("data", (chunk: string) => {
      collected += chunk;
    });
    serialized.on("end", () => {
      resolve(collected);
    });

    // Handlers are attached before any input is pushed through the parser.
    rdfaParser.write(responseText);
    rdfaParser.end();
  });
}
// Content types rdfSerializer can serialize to. Fetched once at module load
// and offered as the candidate list when negotiating the Accept header.
const serializableTypes: string[] = await rdfSerializer.getContentTypes();
/**
 * Edge function handler: serves an RDF serialization of an HTML page's RDFa
 * when the client's Accept header prefers a format rdfSerializer supports.
 *
 * The request is passed through to the upstream untouched when:
 *  - it has no Accept header or is not a GET,
 *  - the negotiated type is missing or is the `*\/*` catch-all,
 *  - the upstream response is not ok, or is not `text/html`.
 *
 * Otherwise the upstream HTML is converted with `convert`. If conversion
 * fails or yields no output, the original HTML is returned instead.
 *
 * @param request - Incoming request.
 * @param context - Netlify edge context; `next()` fetches the upstream
 *   response and `log()` is used for warnings.
 * @returns Either the upstream response or a response carrying the
 *   serialized RDF with `Content-Type` set to the negotiated format.
 */
export default async (
  request: Request,
  context: Context
): Promise<Response> => {
  if (!request.headers.has("Accept") || request.method !== "GET") {
    // Treat as html, pass back to netlify to serve your HTML
    return context.next();
  }

  // Parse the Accept header, providing */* to catch everything that's not
  // serializable as an RDF format:
  const accept = new Accepts(request.headers);
  const acceptedTypes = accept.types([...serializableTypes, "*/*"]);

  if (
    // If we don't have an accepted type,
    !acceptedTypes ||
    // or the catch-all matched (e.g., images or CSS),
    acceptedTypes === "*/*"
  ) {
    // then step aside and let the upstream handle it.
    return context.next();
  }

  // Select the first accepted type, as it should be something supported by rdfSerializer
  const responseFormat = Array.isArray(acceptedTypes)
    ? acceptedTypes[0]
    : acceptedTypes;

  // Fetch the response from the upstream:
  const originalResponse = await context.next();

  // Track how long the transform takes; the original author notes a 50ms
  // budget (per https://docs.netlify.com/edge-functions/limits/):
  const startTime = Date.now();

  // If upstream response wasn't ok, we can't transform it:
  if (!originalResponse.ok) {
    return originalResponse;
  }

  // Don't try handling responses that aren't HTML, as these won't parse as RDFa:
  // `netlify dev` gives `text/html`, real netlify gives `text/html; charset=UTF-8`.
  // Media types are case-insensitive, so compare in lower case.
  const contentType = originalResponse.headers.get("Content-Type");
  if (!contentType || !contentType.toLowerCase().startsWith("text/html")) {
    return originalResponse;
  }

  // Transform the response to the desired format. The upstream body is consumed
  // here, so every return below has to build a new Response from text.
  const responseText = await originalResponse.text();

  let result = "";
  try {
    result = await convert(responseText, {
      // In theory, we should be able to use context.site.url, which should be the site's URL,
      // in reality, this is undefined in development, instead of, y'know the development servers'
      // URL. So instead we just use the root of the request URL, hacky, I know.
      //
      // https://docs.netlify.com/configure-builds/environment-variables/#deploy-urls-and-metadata
      // baseIRI: context.site.url,
      baseIRI: new URL("/", request.url).toString(),
      format: responseFormat,
    });
  } catch (err: unknown) {
    // A parse/serialize failure should not fail the request: we still hold the
    // upstream HTML, so leave `result` empty and take the fallback path below.
    const reason = err instanceof Error ? err.message : String(err);
    context.log(`Warning: error while transforming ${request.url}: ${reason}`);
  }

  // Remove the content-length header, as the body is re-created from text below:
  originalResponse.headers.delete("content-length");

  // Append a server-timing entry to indicate how long the transform took:
  originalResponse.headers.append(
    "Server-Timing",
    `rdfa-transform;dur=${Date.now() - startTime}`
  );

  // For some reason we couldn't extract any RDFa, so just return the original response:
  if (result.length === 0) {
    context.log(`Warning: unable to transform ${request.url}`);
    return new Response(responseText, originalResponse);
  }

  // Override the content-type with the negotiated RDF format, and mark the
  // response as varying on Accept since the body depends on it:
  originalResponse.headers.set("Content-Type", responseFormat);
  originalResponse.headers.append("Vary", "Accept");

  // Return the new response:
  return new Response(result, originalResponse);
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment