Skip to content

Instantly share code, notes, and snippets.

@TheVeryStarlk
Last active May 9, 2024 12:34
Show Gist options
  • Save TheVeryStarlk/fdd5a592c7e515d1802184a335950bfa to your computer and use it in GitHub Desktop.
Save TheVeryStarlk/fdd5a592c7e515d1802184a335950bfa to your computer and use it in GitHub Desktop.
An extension method that shapes separated Arabic letters.
using System.Text;
// Demo input: a verse written with plain (unshaped) Arabic letters.
const string original = "سماء عماد لها وارض تسير وفوقها قمم الروابي فضاء لا انتهاء له وشمس تضيء بحسبة بين السحاب فشد كيانك الادنى برب تنال بقربه شرف الجناب!";

// Switch the console to UTF-8 before anything is written to it.
Console.OutputEncoding = Encoding.UTF8;

// Shape once, then print the text before and after for comparison.
var shaped = original.ShapeArabic();
Console.WriteLine($"Before: \"{original}\", \nafter: \"{shaped}\".");
/// <summary>
/// String helpers for rendering Arabic text on surfaces that neither join letters
/// nor lay text out right-to-left.
/// </summary>
internal static class StringExtensions
{
    // Contextual forms per base letter, taken from
    // https://en.wikipedia.org/wiki/Arabic_script_in_Unicode#Contextual_forms.
    // '\0' marks a form the letter does not have. Letters with only a Final form
    // never connect to the letter that follows them.
    // Code points are written as escapes on purpose: a base letter and its isolated
    // presentation form look alike in most editors but are different keys.
    private static readonly Dictionary<char, (char Final, char Middle, char Initial)> Letters = new()
    {
        ['\u0627'] = ('\uFE8E', '\0', '\0'),         // ALEF
        ['\u0628'] = ('\uFE90', '\uFE92', '\uFE91'), // BEH
        ['\u062A'] = ('\uFE96', '\uFE98', '\uFE97'), // TEH
        ['\u062B'] = ('\uFE9A', '\uFE9C', '\uFE9B'), // THEH
        ['\u062C'] = ('\uFE9E', '\uFEA0', '\uFE9F'), // JEEM
        ['\u062D'] = ('\uFEA2', '\uFEA4', '\uFEA3'), // HAH
        ['\u062E'] = ('\uFEA6', '\uFEA8', '\uFEA7'), // KHAH
        ['\u062F'] = ('\uFEAA', '\0', '\0'),         // DAL
        ['\u0630'] = ('\uFEAC', '\0', '\0'),         // THAL
        ['\u0631'] = ('\uFEAE', '\0', '\0'),         // REH
        ['\u0632'] = ('\uFEB0', '\0', '\0'),         // ZAIN
        ['\u0633'] = ('\uFEB2', '\uFEB4', '\uFEB3'), // SEEN
        ['\u0634'] = ('\uFEB6', '\uFEB8', '\uFEB7'), // SHEEN
        ['\u0635'] = ('\uFEBA', '\uFEBC', '\uFEBB'), // SAD
        ['\u0636'] = ('\uFEBE', '\uFEC0', '\uFEBF'), // DAD
        ['\u0637'] = ('\uFEC2', '\uFEC4', '\uFEC3'), // TAH
        ['\u0638'] = ('\uFEC6', '\uFEC8', '\uFEC7'), // ZAH
        ['\u0639'] = ('\uFECA', '\uFECC', '\uFECB'), // AIN
        ['\u063A'] = ('\uFECE', '\uFED0', '\uFECF'), // GHAIN
        ['\u0641'] = ('\uFED2', '\uFED4', '\uFED3'), // FEH
        ['\u0642'] = ('\uFED6', '\uFED8', '\uFED7'), // QAF
        ['\u0643'] = ('\uFEDA', '\uFEDC', '\uFEDB'), // KAF
        ['\u0644'] = ('\uFEDE', '\uFEE0', '\uFEDF'), // LAM
        ['\u0645'] = ('\uFEE2', '\uFEE4', '\uFEE3'), // MEEM
        ['\u0646'] = ('\uFEE6', '\uFEE8', '\uFEE7'), // NOON
        ['\u0647'] = ('\uFEEA', '\uFEEC', '\uFEEB'), // HEH
        ['\u0648'] = ('\uFEEE', '\0', '\0'),         // WAW
        ['\u064A'] = ('\uFEF2', '\uFEF4', '\uFEF3'), // YEH
        ['\u0629'] = ('\uFE94', '\0', '\0'),         // TEH MARBUTA
        ['\u0649'] = ('\uFEF0', '\0', '\0'),         // ALEF MAKSURA

        // Hamza carriers. ALEF WITH MADDA is keyed on the base letter U+0622 (what
        // real text contains); the isolated presentation form U+FE81 is accepted as
        // well so input that already relied on that key keeps working.
        ['\u0622'] = ('\uFE82', '\0', '\0'),         // ALEF WITH MADDA ABOVE
        ['\uFE81'] = ('\uFE82', '\0', '\0'),         // ALEF WITH MADDA ABOVE (isolated form)
        ['\u0623'] = ('\uFE84', '\0', '\0'),         // ALEF WITH HAMZA ABOVE
        ['\u0624'] = ('\uFE86', '\0', '\0'),         // WAW WITH HAMZA ABOVE
        ['\u0625'] = ('\uFE88', '\0', '\0'),         // ALEF WITH HAMZA BELOW
        ['\u0626'] = ('\uFE8A', '\uFE8C', '\uFE8B')  // YEH WITH HAMZA ABOVE
    };

    /// <summary>
    /// Replaces each Arabic letter with the contextual presentation form (initial,
    /// middle or final) dictated by its neighbours and reverses the character order,
    /// so the text reads correctly when drawn left-to-right without shaping.
    /// </summary>
    /// <example>
    /// <c>"\u0633\u0647\u0644".ShapeArabic()</c> (seen, heh, lam) returns
    /// <c>"\uFEDE\uFEEC\uFEB3"</c>: lam in final form, heh in middle form and
    /// seen in initial form, in reversed order.
    /// </example>
    /// <remarks>
    /// <para>Letters that have vocalization (tashkeel) will not be shaped correctly:
    /// a diacritic is not in the letter table, so it is treated as a word break.</para>
    /// <para>Every character is reversed, not only Arabic ones, so embedded digits or
    /// Latin words come out mirrored. Lam-alef ligatures are not produced.</para>
    /// <para>A letter that stands alone keeps its original code point.</para>
    /// </remarks>
    /// <param name="original">The original, unshaped Arabic <see cref="string"/>.</param>
    /// <returns>
    /// The shaped, reversed <see cref="string"/>; <paramref name="original"/> itself
    /// when it is shorter than two characters.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="original"/> is null.</exception>
    public static string ShapeArabic(this string original)
    {
        ArgumentNullException.ThrowIfNull(original);

        // A single character has no neighbours, so there is nothing to join or reverse.
        if (original.Length < 2)
        {
            return original;
        }

        // The source string is passed as state so the callback can be static and
        // does not allocate a closure on every call.
        return string.Create(original.Length, original, static (span, source) =>
        {
            for (var index = 0; index < span.Length; index++)
            {
                // Output is written front to back while the input is read back to front.
                var position = source.Length - 1 - index;
                var current = source[position];

                if (!Letters.TryGetValue(current, out var forms))
                {
                    // Not a known letter (space, punctuation, digit, ...): copy as is.
                    span[index] = current;
                    continue;
                }

                // The letter can reach forward when the following character is a letter
                // that is able to accept a connection (has a final form).
                var followedByLetter = position + 1 < source.Length
                                       && Letters.TryGetValue(source[position + 1], out var next)
                                       && next.Final is not '\0';

                // The letter is reached from behind when the preceding character is a
                // letter that connects forward (has an initial form).
                var precededByJoiner = position > 0
                                       && Letters.TryGetValue(source[position - 1], out var previous)
                                       && previous.Initial is not '\0';

                // Pick the matching form; when the letter lacks it, fall back to the
                // nearest one it does have, and finally to the unshaped character.
                span[index] = (followedByLetter, precededByJoiner) switch
                {
                    (true, true) => FirstAvailable(forms.Middle, FirstAvailable(forms.Final, current)),
                    (true, false) => FirstAvailable(forms.Initial, current),
                    (false, true) => FirstAvailable(forms.Final, current),
                    _ => current
                };
            }
        });
    }

    // Returns the preferred form unless the letter does not have it ('\0').
    private static char FirstAvailable(char preferred, char fallback)
    {
        return preferred is not '\0' ? preferred : fallback;
    }
}
@TheVeryStarlk
Copy link
Author

TheVeryStarlk commented May 7, 2024

Updated benchmark with 5K characters.

// * Summary *

BenchmarkDotNet v0.13.12, Windows 11 (10.0.22621.3447/22H2/2022Update/SunValley2)
AMD Ryzen 3 3100, 1 CPU, 8 logical and 4 physical cores
.NET SDK 8.0.204
  [Host]     : .NET 8.0.4 (8.0.424.16909), X64 RyuJIT AVX2
  DefaultJob : .NET 8.0.4 (8.0.424.16909), X64 RyuJIT AVX2


| Method      | Mean     | Error    | StdDev   | Gen0   | Allocated |
|------------ |---------:|---------:|---------:|-------:|----------:|
| ShapeArabic | 77.05 us | 0.327 us | 0.306 us | 2.3193 |   9.86 KB |

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment