dejurin/emoji.py

## emoji.py
import os
import requests
import json
import re
from slugify import slugify

# Emoji version passed by main() to get_test_file(), which places it in the
# unicode.org download URL.
EMOJI_VERSION = '15.1'

def main():
    """Fetch emoji-test.txt, convert it to JSON records and write them out.

    Progress is reported on stdout. The comment lines collected from the
    downloaded file are printed once the output file has been written.
    """
    text = get_test_file(EMOJI_VERSION)

    print("Format text to json...")
    collected = parse_emoji_text(text)

    print(f"Processed emojis: {len(collected['full'])}")

    # write_files() produces emoji.json only; the message previously also
    # announced an emoji-compact.json that is never written.
    print("Write file: emoji.json\n")
    write_files(collected)

    print(collected['comments'])

def get_test_file(ver):
    """Download emoji-test.txt for the given emoji version.

    Args:
        ver: Version string inserted into the unicode.org URL, e.g. '15.1'.

    Returns:
        The response body decoded as text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    url = f"https://unicode.org/Public/emoji/{ver}/emoji-test.txt"
    # Report the version actually requested, not the module-level constant.
    print(f"Fetch emoji-test.txt (v{ver})", end="", flush=True)

    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    text = response.text
    print()
    return text

def parse_emoji_text(text):
    """Convert the contents of emoji-test.txt into a dict of collected data.

    Args:
        text: Full text of the file.

    Returns:
        A dict with three keys: 'comments' (the '#' lines that are neither
        group nor subgroup headers, joined into one string), 'full' (one
        record per line accepted by parse_line(), extended with 'category',
        'group' and 'subgroup') and 'compact' (left empty here).
    """
    group_tag = '# group: '
    subgroup_tag = '# subgroup: '

    result = {'comments': '', 'full': [], 'compact': []}
    current_group = None
    current_subgroup = None

    for raw in text.strip().split('\n'):
        entry = raw.strip()

        if entry.startswith(group_tag):
            current_group = entry[len(group_tag):]
            print(f"  Processing {current_group}...")
            continue

        if entry.startswith(subgroup_tag):
            current_subgroup = entry[len(subgroup_tag):]
            continue

        if entry.startswith('#'):
            result['comments'] = result['comments'] + entry + '\n'
            continue

        record = parse_line(entry)
        if not record:
            # Blank or unparsable line: close the current comment paragraph.
            result['comments'] = result['comments'].strip() + '\n\n'
            continue

        record.update(
            category=f"{current_group} ({current_subgroup})",
            group=current_group,
            subgroup=current_subgroup,
        )
        result['full'].append(record)

    return result

def parse_line(line):
    """Parse one data line of emoji-test.txt into a record.

    A data line has the shape
    ``<code points> ; <status> # <emoji> E<version> <name>``.

    Args:
        line: A single line of the file.

    Returns:
        A dict with 'codes' (every code point of the sequence, separated by
        single spaces), 'name' and 'slug', or None when the line is not a
        data line.
    """
    # Split on the field delimiters rather than on whitespace: a sequence
    # such as "1F468 200D 1F469" contains spaces, and a whitespace split
    # kept only its first code point.
    code_field, semicolon, remainder = line.partition(';')
    _status, hash_mark, comment = remainder.partition('#')
    codes = ' '.join(code_field.split())

    if not (semicolon and hash_mark and codes):
        return None

    match = re.match(r'^(.*?)\s+(E\d+\.\d+)\s+(.*?)$', comment.strip())
    if match is None:
        return None

    name = match.group(3)
    return {'codes': codes, 'name': name, 'slug': slugify(name)}

def write_files(data):
    """Write the parsed emoji records as JSON to ../src/data/emoji.json.

    Args:
        data: Mapping whose 'full' entry holds the records to serialize.
    """
    target = rel('../src/data/emoji.json')
    with open(target, mode='w', encoding='utf8') as out:
        out.write(json.dumps(data['full'], ensure_ascii=False, indent=2))

def rel(*args):
    """Return an absolute path for *args* joined onto this file's directory."""
    here = os.path.dirname(__file__)
    joined = os.path.join(here, *args)
    return os.path.abspath(joined)

# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()

"""
[
  {
    "codes": "1F432",
    "name": "dragon face",
    "slug": "dragon-face",
    "category": "Animals & Nature (animal-reptile)",
    "group": "Animals & Nature",
    "subgroup": "animal-reptile"
  },
  ...
]
"""
	import os
	import requests
	import json
	import re
	from slugify import slugify

	# Emoji version passed by main() to get_test_file(), which places it in the
	# unicode.org download URL.
	EMOJI_VERSION = '15.1'

	def main():
	    """Fetch emoji-test.txt, convert it to JSON records and write them out.

	    Progress is reported on stdout. The comment lines collected from the
	    downloaded file are printed once the output file has been written.
	    """
	    text = get_test_file(EMOJI_VERSION)

	    print("Format text to json...")
	    collected = parse_emoji_text(text)

	    print(f"Processed emojis: {len(collected['full'])}")

	    # write_files() produces emoji.json only; the message previously also
	    # announced an emoji-compact.json that is never written.
	    print("Write file: emoji.json\n")
	    write_files(collected)

	    print(collected['comments'])

	def get_test_file(ver):
	    """Download emoji-test.txt for the given emoji version.

	    Args:
	        ver: Version string inserted into the unicode.org URL, e.g. '15.1'.

	    Returns:
	        The response body decoded as text.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	        requests.RequestException: On connection failure or timeout.
	    """
	    url = f"https://unicode.org/Public/emoji/{ver}/emoji-test.txt"
	    # Report the version actually requested, not the module-level constant.
	    print(f"Fetch emoji-test.txt (v{ver})", end="", flush=True)

	    # Without a timeout, requests waits indefinitely on a stalled connection.
	    response = requests.get(url, timeout=30)
	    response.raise_for_status()

	    text = response.text
	    print()
	    return text

	def parse_emoji_text(text):
	    """Convert the contents of emoji-test.txt into a dict of collected data.

	    Args:
	        text: Full text of the file.

	    Returns:
	        A dict with three keys: 'comments' (the '#' lines that are neither
	        group nor subgroup headers, joined into one string), 'full' (one
	        record per line accepted by parse_line(), extended with 'category',
	        'group' and 'subgroup') and 'compact' (left empty here).
	    """
	    lines = text.strip().split('\n')
	    collected = {'comments': '', 'full': [], 'compact': []}
	    group = subgroup = None

	    for line in lines:
	        line = line.strip()

	        if line.startswith('# group: '):
	            print(f" Processing {line[9:]}...")
	            group = line[9:]
	        elif line.startswith('# subgroup: '):
	            subgroup = line[12:]
	        elif line.startswith('#'):
	            collected['comments'] += line + '\n'
	        else:
	            meta = parse_line(line)
	            if meta:
	                meta['category'] = f"{group} ({subgroup})"
	                meta['group'] = group
	                meta['subgroup'] = subgroup
	                collected['full'].append(meta)
	            else:
	                # Blank or unparsable line: close the current comment
	                # paragraph.
	                collected['comments'] = collected['comments'].strip() + '\n\n'

	    return collected

	def parse_line(line):
	    """Parse one data line of emoji-test.txt into a record.

	    A data line has the shape
	    ``<code points> ; <status> # <emoji> E<version> <name>``.

	    Args:
	        line: A single line of the file.

	    Returns:
	        A dict with 'codes' (every code point of the sequence, separated by
	        single spaces), 'name' and 'slug', or None when the line is not a
	        data line.
	    """
	    # Split on the field delimiters rather than on whitespace: a sequence
	    # such as "1F468 200D 1F469" contains spaces, and a whitespace split
	    # kept only its first code point.
	    code_field, semicolon, remainder = line.partition(';')
	    _status, hash_mark, comment = remainder.partition('#')
	    codes = ' '.join(code_field.split())

	    if not (semicolon and hash_mark and codes):
	        return None

	    # The lazy groups need their '*' quantifier; with a bare '.?' the
	    # pattern could only match one-character emoji and names.
	    match = re.match(r'^(.*?)\s+(E\d+\.\d+)\s+(.*?)$', comment.strip())
	    if match is None:
	        return None

	    name = match.group(3)
	    return {'codes': codes, 'name': name, 'slug': slugify(name)}

	def write_files(data):
	    """Write the parsed emoji records as JSON to ../src/data/emoji.json.

	    Args:
	        data: Mapping whose 'full' entry holds the records to serialize.
	    """
	    with open(rel('../src/data/emoji.json'), 'w', encoding='utf8') as full_file:
	        json.dump(data['full'], full_file, ensure_ascii=False, indent=2)

	def rel(*args):
	    """Return an absolute path for *args* joined onto this file's directory."""
	    return os.path.abspath(os.path.join(os.path.dirname(__file__), *args))

	# Run the conversion only when this file is executed as a script.
	if __name__ == "__main__":
	    main()

	"""
	[
	{
	"codes": "1F432",
	"name": "dragon face",
	"slug": "dragon-face",
	"category": "Animals & Nature (animal-reptile)",
	"group": "Animals & Nature",
	"subgroup": "animal-reptile"
	},
	...
	]
	"""