Skip to content

Instantly share code, notes, and snippets.

@ramsey
Last active November 11, 2021 06:02
Show Gist options
  • Save ramsey/8918f0eeb80e88c4bee6d7c9e10f7549 to your computer and use it in GitHub Desktop.
Save ramsey/8918f0eeb80e88c4bee6d7c9e10f7549 to your computer and use it in GitHub Desktop.
Parses the IANA language subtag registry into JSON, which can be queried with `jq`

Parse language subtag registry into JSON

php language-subtag-registry.php > language-subtag-registry.json

Grandfathered or redundant tags with preferred values

The query below outputs every grandfathered tag (mapped to `null` when it has no preferredValue) together with every redundant tag that has a preferredValue.

cat language-subtag-registry.json \
    | jq '
        .records[]
        | select((.type == "redundant" and has("preferredValue")) or .type == "grandfathered")
        | {(.tag): .preferredValue?}
    ' \
    | jq -s -S add

Produces:

{
  "art-lojban": "jbo",
  "cel-gaulish": null,
  "en-GB-oed": "en-GB-oxendict",
  "i-ami": "ami",
  "i-bnn": "bnn",
  "i-default": null,
  "i-enochian": null,
  "i-hak": "hak",
  "i-klingon": "tlh",
  "i-lux": "lb",
  "i-mingo": null,
  "i-navajo": "nv",
  "i-pwn": "pwn",
  "i-tao": "tao",
  "i-tay": "tay",
  "i-tsu": "tsu",
  "no-bok": "nb",
  "no-nyn": "nn",
  "sgn-BE-FR": "sfb",
  "sgn-BE-NL": "vgt",
  "sgn-BR": "bzs",
  "sgn-CH-DE": "sgg",
  "sgn-CO": "csn",
  "sgn-DE": "gsg",
  "sgn-DK": "dsl",
  "sgn-ES": "ssp",
  "sgn-FR": "fsl",
  "sgn-GB": "bfi",
  "sgn-GR": "gss",
  "sgn-IE": "isg",
  "sgn-IT": "ise",
  "sgn-JP": "jsl",
  "sgn-MX": "mfs",
  "sgn-NI": "ncs",
  "sgn-NL": "dse",
  "sgn-NO": "nsl",
  "sgn-PT": "psr",
  "sgn-SE": "swl",
  "sgn-US": "ase",
  "sgn-ZA": "sfs",
  "zh-cmn": "cmn",
  "zh-cmn-Hans": "cmn-Hans",
  "zh-cmn-Hant": "cmn-Hant",
  "zh-gan": "gan",
  "zh-guoyu": "cmn",
  "zh-hakka": "hak",
  "zh-min": null,
  "zh-min-nan": "nan",
  "zh-wuu": "wuu",
  "zh-xiang": "hsn"
}
<?php
declare(strict_types=1);

/*
 * Reads the IANA language subtag registry over HTTP and prints it to STDOUT
 * as pretty-printed JSON of the form:
 *
 *     {"file-date": "<File-Date value>", "records": [{...}, ...]}
 *
 * Records are separated by lines containing only "%%". Field names are
 * converted by prepareKey() (e.g. "Preferred-Value" becomes "preferredValue").
 * The "comments", "description" and "prefix" fields are collected as lists;
 * every other field is stored as a single string.
 *
 * Exits with status 1 if the registry cannot be opened or encoded.
 */
const REGISTRY_URL = 'https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry';

// Identify this script to the server through the User-Agent header.
$context = stream_context_create([
    'http' => [
        'user_agent' => 'language-subtag-registry-crawler/0.1',
    ],
]);

$registryFile = fopen(REGISTRY_URL, 'r', false, $context);

if ($registryFile === false) {
    fwrite(STDERR, 'Unable to open ' . REGISTRY_URL . "\n");
    exit(1);
}

$registryData = [];

// Grab the first line from the file, which is the File-Date record.
[, $fileDate] = splitFields((string) fgets($registryFile));
$registryData['file-date'] = $fileDate;
$registryData['records'] = [];

// Fields that are stored as a list of values rather than a single string.
$multiValueKeys = ['comments', 'description', 'prefix'];

// The record currently being assembled; null while between records.
$record = null;

// Key of the most recent field in the current record, so that folded
// (continuation) lines know which value they extend.
$previousKey = null;

while (($line = fgets($registryFile)) !== false) {
    $line = rtrim($line);

    // A blank line carries no data; skipping it avoids appending a stray
    // space to the previous field.
    if ($line === '') {
        continue;
    }

    if ($line === '%%') {
        if ($record !== null) {
            $registryData['records'][] = $record;
            $record = null;
        }

        // A continuation line must never attach to a field of the
        // previous record.
        $previousKey = null;

        continue;
    }

    if ($line[0] === ' ' || $line[0] === "\t") {
        // Folded lines start with whitespace. They are continuations even
        // when they contain a colon (for example a URL), so they must not
        // be split into a key and a value.
        $key = null;
        $value = trim($line);
    } else {
        [$key, $value] = splitFields($line);

        // A line without a usable field name is handled as a continuation.
        $key = ($key === null || $key === '') ? null : prepareKey($key);
    }

    if ($key === null) {
        if ($previousKey === null) {
            // Nothing to attach the text to; this shouldn't happen.
            continue;
        }

        if (in_array($previousKey, $multiValueKeys, true)) {
            // Extend the most recently added entry of the list.
            $lastIndex = count($record[$previousKey]) - 1;
            $record[$previousKey][$lastIndex] .= ' ' . $value;
        } else {
            $record[$previousKey] .= ' ' . $value;
        }

        continue;
    }

    if ($record === null) {
        $record = [];
    }

    if (in_array($key, $multiValueKeys, true)) {
        $record[$key][] = $value;
    } else {
        $record[$key] = $value;
    }

    $previousKey = $key;
}

// "%%" separates records rather than terminating them, so the final record
// is still pending when the end of the file is reached.
if ($record !== null) {
    $registryData['records'][] = $record;
}

fclose($registryFile);

$json = json_encode($registryData, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);

if ($json === false) {
    fwrite(STDERR, 'Unable to encode registry as JSON: ' . json_last_error_msg() . "\n");
    exit(1);
}

echo $json;
echo "\n";
/**
 * Splits a "Name: value" line at its first colon.
 *
 * Both parts are trimmed of surrounding whitespace. When the line contains
 * no colon at all, the name slot is null and the whole trimmed line is
 * returned as the value.
 *
 * @return array{0: string | null, 1: string}
 */
function splitFields(string $record): array
{
    $colonAt = strpos($record, ':');

    if ($colonAt === false) {
        return [null, trim($record)];
    }

    $name = substr($record, 0, $colonAt);
    $body = substr($record, $colonAt + 1);

    return [trim($name), trim($body)];
}
/**
 * Converts a hyphenated field name into lower camel case,
 * e.g. "Preferred-Value" becomes "preferredValue".
 *
 * The name is lower-cased, hyphens are turned into spaces so that ucwords()
 * capitalises each word, the spaces are removed again, and finally the
 * first character is lower-cased.
 */
function prepareKey(string $key): string
{
    $words = ucwords(strtr(strtolower($key), '-', ' '));

    return lcfirst(str_replace(' ', '', $words));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment