Skip to content

Instantly share code, notes, and snippets.

@misterburton
Last active January 6, 2026 20:19
Show Gist options
  • Select an option

  • Save misterburton/e836997d1b5335982500e0520db21c24 to your computer and use it in GitHub Desktop.

Select an option

Save misterburton/e836997d1b5335982500e0520db21c24 to your computer and use it in GitHub Desktop.
AI-Powered Localization - Translation Generation Script
/**
* pre-translate.js
*
* Extracts translatable content from HTML pages and sends it to a translation API.
* Stores results in Vercel KV for fast edge-cached retrieval.
*
* REQUIREMENTS:
* - Node.js 18+
* - npm packages: fs-extra, jsdom, @vercel/kv, dotenv
* - Vercel project with KV database configured
* - Translation API endpoint (see /api/translate.js)
* - Local dev server running (vercel dev) on port 3000
*
* SETUP:
* 1. npm install fs-extra jsdom @vercel/kv dotenv
* 2. Run `vercel env pull .env.local` to get KV credentials
* 3. Create sitemap.xml listing all pages to translate
* 4. Run `vercel dev` in a separate terminal
* 5. Run `node pre-translate.js`
*
* HOW IT WORKS:
* - Reads sitemap.xml to find all pages
* - Extracts content from elements with data-l10n-id attributes
* - Computes a hash of the content to detect changes
* - Sends content to translation API for each target language
* - Stores each page's source-content hash in content-hashes.json so unchanged pages are skipped on later runs
*/
const fs = require('fs-extra');
const path = require('path');
const { JSDOM } = require('jsdom');
const crypto = require('crypto');
const { createClient } = require('@vercel/kv');
require('dotenv').config({ path: '.env.local' });
// Vercel KV client, built from the REST credentials that dotenv loaded from .env.local above.
// NOTE(review): `kv` is not referenced elsewhere in the visible script — confirm whether it is still needed.
const kvCredentials = {
  url: process.env.KV_REST_API_URL,
  token: process.env.KV_REST_API_TOKEN,
};
const kv = createClient(kvCredentials);
// Supported languages, as { code, name } records.
// Keep this list in sync with the languages your translation API accepts.
// Entries are written as compact [code, name] pairs and expanded into objects below.
const LANGUAGES = [
  ['af', 'Afrikaans'], ['sq', 'Albanian'], ['am', 'Amharic'],
  ['ar', 'Arabic'], ['hy', 'Armenian'], ['az', 'Azerbaijani'],
  ['eu', 'Basque'], ['be', 'Belarusian'], ['bn', 'Bengali'],
  ['bs', 'Bosnian'], ['bg', 'Bulgarian'], ['ca', 'Catalan'],
  ['ceb', 'Cebuano'], ['zh', 'Chinese'], ['hr', 'Croatian'],
  ['cs', 'Czech'], ['da', 'Danish'], ['nl', 'Dutch'],
  ['en', 'English'], ['eo', 'Esperanto'], ['et', 'Estonian'],
  ['fi', 'Finnish'], ['fr', 'French'], ['gl', 'Galician'],
  ['ka', 'Georgian'], ['de', 'German'], ['el', 'Greek'],
  ['gu', 'Gujarati'], ['ht', 'Haitian Creole'], ['ha', 'Hausa'],
  ['haw', 'Hawaiian'], ['he', 'Hebrew'], ['hi', 'Hindi'],
  ['hu', 'Hungarian'], ['is', 'Icelandic'], ['ig', 'Igbo'],
  ['id', 'Indonesian'], ['ga', 'Irish'], ['it', 'Italian'],
  ['ja', 'Japanese'], ['jv', 'Javanese'], ['kn', 'Kannada'],
  ['kk', 'Kazakh'], ['km', 'Khmer'], ['rw', 'Kinyarwanda'],
  ['ko', 'Korean'], ['ku', 'Kurdish'], ['ky', 'Kyrgyz'],
  ['lo', 'Lao'], ['la', 'Latin'], ['lv', 'Latvian'],
  ['lt', 'Lithuanian'], ['lb', 'Luxembourgish'], ['mk', 'Macedonian'],
  ['mg', 'Malagasy'], ['ms', 'Malay'], ['ml', 'Malayalam'],
  ['mt', 'Maltese'], ['mi', 'Maori'], ['mr', 'Marathi'],
  ['mn', 'Mongolian'], ['my', 'Myanmar (Burmese)'], ['ne', 'Nepali'],
  ['no', 'Norwegian'], ['ny', 'Nyanja (Chichewa)'], ['or', 'Odia (Oriya)'],
  ['ps', 'Pashto'], ['fa', 'Persian'], ['pl', 'Polish'],
  ['pt', 'Portuguese'], ['pa', 'Punjabi'], ['ro', 'Romanian'],
  ['ru', 'Russian'], ['sm', 'Samoan'], ['gd', 'Scots Gaelic'],
  ['sr', 'Serbian'], ['st', 'Sesotho'], ['sn', 'Shona'],
  ['sd', 'Sindhi'], ['si', 'Sinhala (Sinhalese)'], ['sk', 'Slovak'],
  ['sl', 'Slovenian'], ['so', 'Somali'], ['es', 'Spanish'],
  ['su', 'Sundanese'], ['sw', 'Swahili'], ['sv', 'Swedish'],
  ['tl', 'Tagalog (Filipino)'], ['tg', 'Tajik'], ['ta', 'Tamil'],
  ['tt', 'Tatar'], ['te', 'Telugu'], ['th', 'Thai'],
  ['tr', 'Turkish'], ['tk', 'Turkmen'], ['uk', 'Ukrainian'],
  ['ur', 'Urdu'], ['ug', 'Uyghur'], ['uz', 'Uzbek'],
  ['vi', 'Vietnamese'], ['cy', 'Welsh'], ['xh', 'Xhosa'],
  ['yi', 'Yiddish'], ['yo', 'Yoruba'], ['zu', 'Zulu'],
].map(([code, name]) => ({ code, name }));
// Script settings. Edit these to match your site and local environment.
const CONFIG = {
  // Base URL of the site; only sitemap entries starting with this prefix are processed.
  siteUrl: 'https://yoursite.com/',

  // Translation endpoint served by the local dev server.
  apiUrl: 'http://localhost:3000/api/translate',

  // Number of languages requested in parallel per batch.
  concurrencyLimit: 10,

  // Per-request timeout in milliseconds (two minutes).
  requestTimeout: 2 * 60 * 1000,
};
/**
 * Extracts the page URLs listed in a sitemap that belong to this site.
 *
 * @param {string} sitemapXml - Raw contents of sitemap.xml.
 * @param {string} siteUrl - Prefix a URL must start with to be kept.
 * @returns {string[]} Matching URLs in document order (empty when the sitemap has no <loc> entries).
 */
function extractSitemapUrls(sitemapXml, siteUrl) {
  // matchAll yields nothing for a sitemap without <loc> tags, whereas match() would return null.
  // [\s\S] plus trim() also accepts <loc> elements whose URL sits on its own line.
  return Array.from(sitemapXml.matchAll(/<loc>([\s\S]*?)<\/loc>/g), (match) => match[1].trim())
    .filter((url) => url.startsWith(siteUrl));
}

/**
 * Collects the translatable content of a parsed page.
 *
 * @param {Document} document - DOM document to scan for [data-l10n-id] elements.
 * @returns {{ originalContent: Object<string, string>, normalizedForHash: Object<string, string> }}
 *   `originalContent` maps each id to the text/HTML sent for translation;
 *   `normalizedForHash` maps each id to whitespace-normalized text used for change detection.
 */
function collectTranslatableContent(document) {
  // Index elements by id in a single pass. The first element wins for a repeated id,
  // and no selector string is built from the id, so ids containing quotes are safe.
  const elementsById = new Map();
  for (const el of document.querySelectorAll('[data-l10n-id]')) {
    const id = el.dataset.l10nId;
    if (id && !elementsById.has(id)) elementsById.set(id, el);
  }

  const originalContent = {};
  const normalizedForHash = {};

  // Sorted ids keep the hash independent of element order in the markup.
  for (const id of [...elementsById.keys()].sort()) {
    const el = elementsById.get(id);
    let content = '';
    let textForHash = '';

    if (el.tagName === 'INPUT' || el.tagName === 'TEXTAREA') {
      content = el.placeholder || '';
      textForHash = content;
    } else if (el.tagName === 'META') {
      content = el.getAttribute('content') || '';
      textForHash = content;
    } else {
      content = el.innerHTML.trim();
      // Hash the text only, without the listed dynamic/decorative children.
      const clone = el.cloneNode(true);
      clone.querySelectorAll('.footer-year, .copy-button, .line-number').forEach((node) => node.remove());
      textForHash = clone.textContent;
    }

    if (!content) continue;

    originalContent[id] = content;
    normalizedForHash[id] = textForHash
      .replace(/[\u200B-\u200D\uFEFF]/g, '') // Remove zero-width chars
      .replace(/\s+/g, ' ') // Collapse whitespace
      .trim();
  }

  return { originalContent, normalizedForHash };
}

/**
 * Requests the translation of one page into one language.
 * Never throws: every failure is logged and reported in the result object.
 *
 * @param {{ code: string, name: string }} lang - Target language.
 * @param {{ pageId: string, content: Object<string, string>, contentHash: string }} request - Page payload.
 * @returns {Promise<{ success: boolean, lang: object, error?: string }>}
 */
async function translateLanguage(lang, request) {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), CONFIG.requestTimeout);
  try {
    const response = await fetch(CONFIG.apiUrl, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        pageId: request.pageId,
        content: request.content,
        targetLanguage: lang.name,
        contentHash: request.contentHash,
        bypassCache: true // Force fresh translation
      }),
      signal: controller.signal
    });
    if (!response.ok) {
      const err = await response.text();
      console.error(` [${lang.code}] Failed: ${err}`);
      return { success: false, lang, error: err };
    }
    const data = await response.json();
    console.log(` [${lang.code}] ${data.cached ? 'Cached' : 'Translated'}`);
    return { success: true, lang };
  } catch (error) {
    const msg = error.name === 'AbortError' ? 'Timed out' : error.message;
    console.error(` [${lang.code}] Error: ${msg}`);
    return { success: false, lang, error: msg };
  } finally {
    // Cleared only once the body has been read, so the timeout also covers a stalled response body.
    clearTimeout(timeoutId);
  }
}

/**
 * Translates a list of languages in batches of CONFIG.concurrencyLimit.
 *
 * @param {Array<{ code: string, name: string }>} languages - Languages to translate.
 * @param {{ pageId: string, content: Object<string, string>, contentHash: string }} request - Page payload.
 * @param {boolean} [showProgress=false] - Whether to log a "Batch x/y" line per batch.
 * @returns {Promise<Array<{ code: string, name: string }>>} Languages whose request failed.
 */
async function translateInBatches(languages, request, showProgress = false) {
  // Guard against a zero/negative/non-numeric limit, which would otherwise never advance the loop.
  const batchSize = Math.max(1, Math.floor(Number(CONFIG.concurrencyLimit)) || 1);
  const totalBatches = Math.ceil(languages.length / batchSize);
  const failed = [];

  for (let start = 0; start < languages.length; start += batchSize) {
    if (showProgress) {
      console.log(` Batch ${start / batchSize + 1}/${totalBatches}...`);
    }
    const batch = languages.slice(start, start + batchSize);
    const results = await Promise.all(batch.map((lang) => translateLanguage(lang, request)));
    for (const result of results) {
      if (!result.success) failed.push(result.lang);
    }
  }

  return failed;
}

/**
 * Entry point: translates every page listed in sitemap.xml whose content changed.
 *
 * For each sitemap URL under CONFIG.siteUrl, reads the matching local HTML file,
 * extracts [data-l10n-id] content, hashes it, and — when the hash differs from the
 * one recorded in content-hashes.json — posts the content to the translation API
 * for every language except English. Failed languages are retried up to two times.
 *
 * @returns {Promise<void>} Resolves when all pages have been processed.
 *   Returns early (after logging an error) when sitemap.xml is missing.
 */
async function preTranslate() {
  const MAX_RETRIES = 2;
  const RETRY_DELAY = 2000; // 2 seconds between retries

  const sitemapPath = path.resolve('sitemap.xml');
  if (!fs.existsSync(sitemapPath)) {
    console.error('Error: sitemap.xml not found');
    console.error('Create a sitemap.xml file listing all pages to translate.');
    return;
  }

  const sitemapContent = await fs.readFile(sitemapPath, 'utf8');
  const urls = extractSitemapUrls(sitemapContent, CONFIG.siteUrl);
  if (urls.length === 0) {
    console.warn(`No <loc> entries starting with ${CONFIG.siteUrl} found in sitemap.xml.`);
  }

  // Load existing content hashes to detect changes
  const hashFile = path.resolve('content-hashes.json');
  const hashes = fs.existsSync(hashFile) ? await fs.readJson(hashFile) : {};

  // Translate to all languages except English (the source language).
  const languagesToTranslate = LANGUAGES.filter((l) => l.code !== 'en');

  for (const url of urls) {
    // Every url starts with siteUrl (filtered above), so slicing removes exactly that prefix.
    let relativePath = url.slice(CONFIG.siteUrl.length);
    if (!relativePath || relativePath.endsWith('/')) relativePath += 'index.html';

    const filePath = path.resolve(relativePath);
    if (!fs.existsSync(filePath)) {
      console.warn(`File not found for URL ${url}: ${filePath}`);
      continue;
    }

    const pageName = getPageName(relativePath);
    console.log(`\nProcessing page: ${pageName} (${relativePath})`);

    // Parse HTML and extract translatable content
    const html = await fs.readFile(filePath, 'utf8');
    const { document } = new JSDOM(html).window;
    const { originalContent, normalizedForHash } = collectTranslatableContent(document);

    // Compute hash of all content to detect changes
    const contentHash = crypto.createHash('sha256')
      .update(JSON.stringify(normalizedForHash))
      .digest('hex');

    if (!hashes[pageName]) hashes[pageName] = {};

    // Skip if content hasn't changed since last translation
    if (hashes[pageName]._translationHash === contentHash) {
      console.log(`Skipping ${pageName}: Content unchanged.`);
      continue;
    }

    console.log(`Changes detected in ${pageName}. Translating to ${languagesToTranslate.length} languages...`);

    const request = { pageId: pageName, content: originalContent, contentHash };

    // Initial translation pass
    let failedLanguages = await translateInBatches(languagesToTranslate, request, true);

    // Retry failed languages, still honouring the concurrency limit
    for (let attempt = 1; attempt <= MAX_RETRIES && failedLanguages.length > 0; attempt++) {
      console.log(`\n Retrying ${failedLanguages.length} failed language(s) (attempt ${attempt}/${MAX_RETRIES})...`);
      await new Promise((resolve) => setTimeout(resolve, RETRY_DELAY));
      failedLanguages = await translateInBatches(failedLanguages, request);
      if (failedLanguages.length === 0) {
        console.log(` ✓ All retries successful!`);
      }
    }

    // Always save the hash after attempting translation.
    // The hash tracks SOURCE CONTENT state, not API success.
    // This prevents re-translating the same content due to intermittent API failures.
    // Failed languages can be identified by checking KV cache directly.
    hashes[pageName]._translationHash = contentHash;
    await fs.writeJson(hashFile, hashes, { spaces: 2 });

    if (failedLanguages.length === 0) {
      console.log(`✓ Successfully translated ${pageName} to all languages.`);
    } else {
      console.log(`⚠ Translated ${pageName} with ${failedLanguages.length} failure(s): ${failedLanguages.map((l) => l.code).join(', ')}`);
      console.log(` To retry: delete _translationHash for this page in content-hashes.json`);
    }
  }

  console.log('\nPre-translation complete.');
}
/**
 * Derives the page identifier used for a given HTML file path.
 *
 * A file named index.html takes the name of its folder, or 'home' when it
 * sits at the site root; any other file uses its own name without '.html'.
 *
 * Examples:
 *   index.html       -> 'home'
 *   about/index.html -> 'about'
 *   blog/post.html   -> 'post'
 *
 * @param {string} filePath - Path of the page, relative to the site root.
 * @returns {string} The page name.
 */
function getPageName(filePath) {
  const stem = path.basename(filePath, '.html');
  if (stem !== 'index') {
    return stem;
  }

  const folder = path.dirname(filePath);
  const isSiteRoot = folder === '.' || folder === '';
  return isSiteRoot ? 'home' : path.basename(folder);
}
// Run the script. An unexpected failure is logged and turned into a non-zero
// exit status so that shells and CI pipelines can detect it.
preTranslate().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment