Skip to content

Instantly share code, notes, and snippets.

@answerquest
Last active September 11, 2017 12:08
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save answerquest/74c13f73f1bfb21c3177 to your computer and use it in GitHub Desktop.
Save answerquest/74c13f73f1bfb21c3177 to your computer and use it in GitHub Desktop.
Convert legacy font in Pune's Budget book to Unicode Devnagri script
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>PuneBudget <==> यूनिकोड परिवर्तित्र (12-11-2014) </title>
<script type="text/javascript">
// for Marathi script used by Pune Municipal Corporation in their budget book, http://www.punecorporation.org/informpdf/budget/1415%20SC%20BUDGET%20BOOK.pdf
/**
 * Converts the legacy-font Marathi text found in the "legacy_text" textarea into
 * Unicode Devanagari and writes the result to the "unicode_text" textarea, which is
 * then selected.
 *
 * Written for the Marathi font used by Pune Municipal Corporation in their budget book,
 * http://www.punecorporation.org/informpdf/budget/1415%20SC%20BUDGET%20BOOK.pdf
 *
 * How it works:
 *   1. The input is cut into chunks of roughly max_text_size characters. A chunk always
 *      ends at a space, so a word is never split between two chunks.
 *   2. For each chunk, every (legacy, unicode) pair of array_one is applied in table
 *      order; each pair is applied repeatedly until the legacy sequence no longer occurs.
 *   3. Regex passes then move the glyphs that the legacy font types on the "wrong" side
 *      of a consonant (short-i maatraa, nukta, reph) to their Unicode position.
 *
 * Takes no parameters and returns nothing; it only reads and writes the two textareas.
 */
function convert_to_unicode()
{
    // Flat list of pairs: even index = legacy sequence to find, odd index = its replacement.
    // ORDER MATTERS: longer sequences must be listed before the shorter ones they contain.
    var array_one = [
        //beginning replacements : need to do these before anything else
        //errors:
        "Pv=§=§" , "टुं" ,
        "ìg¡«" , "र्व्है" ,
        "P²´" , "P´" ,
        "@lh§§§ §Wq" , "आळंदी" ,
        //errors done
        //ZAPPING the $,=,Ÿ etc completely. That would free us up from having to define so many combos for क , फ , ट etc.
        "$" , "" , //ZAPPING the $.. how symbolic..
        "Ÿ", "" ,
        "=" , "" ,
        //Zapping done
        //end beginning replacement section
        /* //Optional : Numbers conversion : We might not want to convert the digits. The Budget book keeps the digits in English script only.
        "1" , "१" ,
        "2" , "२" ,
        "3" , "३" ,
        "4" , "४" ,
        "5" , "५" ,
        "6" , "६" ,
        "7" , "७" ,
        "8" , "८" ,
        "9" , "९" ,
        "0" , "०" ,
        */
        "@l°" , "ऑ" ,
        "@ln" , "ओ" ,
        "@l¡" , "औ" ,
        "@l¢" , "औं" ,
        "@l" , "आ" ,
        "@" , "अ" ,
        "A©§" , "ईं" ,
        "A©" , "ई" ,
        "A" , "इ" ,
        "B" , "उ" ,
        "C" , "ऊ" , //usually appearing with $ next to it, but zapping the $.
        "Dn" , "ऐ" ,
        "D" , "ए" ,
        "E" , "ऋ" ,
        "G" , "क" , // test: changed from G$.. trying out zapping the $ entirely
        "Š" , "क्" ,
        "³" , "क्" ,
        "¸" , "क्क" ,
        "¹" , "क्व" ,
        "º" , "क्त" ,
        "H" , "ख" ,
        "»" , "ख्" ,
        "¼" , "ख्र" ,
        "I" , "ग" ,
        "½" , "ग्" ,
        "J" , "घ" ,
        "¿" , "घ्" ,
        "L" , "च" ,
        "À" , "च्" ,
        "ƒ" , "च्च" ,
        "M" , "छ" ,
        "N" , "ज" ,
        "Á" , "ज्" ,
        "‚l" , "ज्ज" ,
        "O" , "झ" ,
        "Â" , "झ्" ,
        "Äl" , "ञ" ,
        "P—" , "ट्र" ,
        "P" , "ट" ,
        "ï" , "ष्ट" ,
        "Å" , "ट्ट" ,
        "Q" , "ठ" ,
        "Æ" , "ट्ठ" ,
        "Ç" , "ठ्ठ" ,
        "ð" , "ष्ठ" ,
        "R—" , "ड्र" ,
        "R" , "ड" ,
        "È" , "ड्ड" ,
        "S" , "ढ" ,
        "T" , "ण" ,
        "Ê" , "ण्" ,
        "U" , "त" ,
        "Ë" , "त्" ,
        "Îl" , "त्त" ,
        "Î" , "त्त्" ,
        "V" , "थ" ,
        "Ϩl" , "थ" ,
        "Ï" , "थ्" ,
        "W" , "द" ,
        "Ð" , "द्र" ,
        "Ö" , "द्र" ,
        "Û" , "द्व" ,
        "Ú" , "द्य" ,
        "Ô" , "द्द" ,
        "Ñ" , "दृ" ,
        "×" , "ब्द" ,
        "Õ" , "द्ध" ,
        "Ù" , "द्म" ,
        "X" , "ध" ,
        "Ü" , "ध्" ,
        "Y" , "न" ,
        "Ý" , "न्" ,
        "Þ" , "न्न" ,
        "¾" , "ग्न" ,
        "m" , "प" ,
        "ß" , "प्" ,
        "à" , "प्र" ,
        "á", "प्त" ,
        "[ l", "फा",
        "[" , "फ" ,
        "â" , "फ्" ,
        "}" , "ब" ,
        "ã" , "ब्" ,
        "]" , "भ" ,
        "ä" , "भ्" ,
        "^" , "म" ,
        "å" , "म्" ,
        "_" , "य" ,
        "æ" , "य्" ,
        "ç" , "्य" ,
        "`" , "र" ,
        "è" , "ऱ्" ,
        "é" , "रु" ,
        "ê" , "रू" ,
        "«" , "्र" , // makes ट्र ड्र etc
        "´" , "्र" , // makes ट्र ड्र etc
        "a" , "ल" ,
        "b" , "ल" ,
        "ë" , "ल्" ,
        "„" , "ल्ल" ,
        "h" , "ळ" ,
        "ù" , "ळ्" ,
        "d" , "श" ,
        "í" , "श्" ,
        "c" , "व" ,
        "ì" , "व्" ,
        "e" , "ष" ,
        "îl" , "ष" ,
        "î" , "ष्" , //make sure this comes after the îl line
        "f" , "स" ,
        "ñ" , "स्" ,
        "ó" , "स्त्र" ,
        "g" , "ह" ,
        "ô" , "ह्" ,
        "ö" , "हृ" ,
        "ø" , "ह्य" ,
        "i" , "क्ष" ,
        "úl" , "क्ष" ,
        "ú" , "क्ष्" ,
        "j" , "ज्ञ" ,
        "k" , "श्र" ,
        "Ì" , "त्र" ,
        "Í" , "त्र्" ,
        //symbols:
        "&" , "।" ,
        //"\"", "'",
        ">" , "?" ,
        //Maatraas:
        // combos of preceding and succeeding matras. Replacing them with © which will be processed as preceding र् as well as whichever succeeding matra they correspond to
        "t" , "©ी" , //like in र्वी
        "|" , "©े" ,
        "ª" , "©ं" , // "ª" corresponds to the matra in "पर्यंत",
        "u" , "©ीं" , //like in र्थीं
        "\\" , "©ें" , // "\" corresponds to the triple-matra in र्जें.
        "²" , "्" , // keep at end, after other uses of ² have been done
        "ln°" , "ॉ" , //added on 17 Apr 2015 after getting a कोॅलेज
        "l°" , "ॉ" ,
        "ln" , "ो" ,
        "l¡" , "ौ" ,
        "l¢" , "ौं" ,
        "q" , "ी" , //q for thin-on-the-right letters like न, r for fat letters like क
        "r" , "ी" ,
        "v" , "ु" ,
        "þ" , "ु" , //another version that's not directly under the char but a little to the right
        "x" , "ू" ,
        "ÿ" , "ू" , //another version that's not directly under the char but a little to the right
        "¥" , "ृ" ,
        "n" , "े" ,
        "¡" , "ै" ,
        "§" , "ं" ,
        "¨" , "ं" ,
        "±" , "ँ" ,
        // ":" , "ः" ,
        "…" , ":" , //causing issues.. in most places it's a simple colon only.
        "°" , "ॅ" ,
        //"" , "ऽ" ,
        "l" , "ा" ,
        //"" , "।" , // dandaa
        //"" , "॰"
        "s" , "ीं" ,
        "{" , "ें" ,
        // clean-up of maatraa combinations produced by the replacements above
        "्ो" , "े" ,
        "्ौ" , "ै" ,
        "्ाे" , "े" ,
        "्ाा" , "ा" ,
        "ाे" , "ो" ,
        "ाे" , "ो" ,
        "ाै" , "ौ" ,
        "्ा" , "" ,
        "ंु" , "ुं" ,
        "ओ े" , "ओ" , // "ओ" + "े" ,
        "ोे" , "ो" ,
        "ाे" , "ो" ,
        "ईंं" , "ईं"
    ];

    // Consonants that a misplaced glyph may have to jump over. ळ is included because the
    // table above produces it ("h" -> "ळ"); without it a short-i, nukta or reph typed next
    // to ळ would be left on the wrong side of the letter.
    var consonants = "कखगघङचछजझञटठडड़ढढ़णतथदधनपफबभमयरलवशषसहक़ख़ग़ज़ड़ढ़फ़ळ" ;
    var maatraas = "ािीुूृेैोौंँ" ;

    // Glyphs the legacy font types BEFORE the consonant: z, Z, o (short i), p (short i + anusvara),
    // µ and ‹ (nukta). The first regex moves one across a single consonant (or the © reph marker),
    // the second across "halant + consonant" so that it ends up after a whole conjunct.
    var leading_before_consonant = new RegExp( "([µ‹opzZ])([" + consonants + "©])" , "g" ) ;
    var leading_before_half_form = new RegExp( "([µ‹opzZ])(्)([" + consonants + "©])" , "g" ) ;

    // © is the temporary reph marker emitted by the table; it is typed AFTER the consonant
    // (and its maatraas) but "र्" has to be stored BEFORE it.
    var reph_after_consonant = new RegExp( "([" + consonants + "])([" + maatraas + "]*)([©])" , "g" ) ;
    var reph_after_half_form = new RegExp( "([" + consonants + "])([्])([©])" , "g" ) ;

    var unicode_box = document.getElementById("unicode_text") ;
    unicode_box.value = "You have chosen SIMPLE TEXT in PMC-Budget font to convert into Unicode. Conversion in progress.." ;

    // Read the input once instead of querying the DOM on every loop step.
    var legacy_text = document.getElementById("legacy_text").value ;
    // Optional (disabled): for text copied out of a PDF, zap all repeating spaces first:
    //   legacy_text = legacy_text.replace(/\ {2,}/g, ' ');

    //******************************************************
    // Break the long text into small bunches of about max_text_size characters each
    // (presumably to keep the repeated find/replace passes cheap on long inputs).
    //******************************************************
    var text_size = legacy_text.length ;
    var max_text_size = 6000 ;
    var processed_text = '' ; //blank
    var chunk_start = 0 ;
    var chunk_end = 0 ;

    while ( chunk_start < text_size )
    {
        chunk_end = find_chunk_end( chunk_start ) ;
        processed_text += Replace_Symbols( legacy_text.substring( chunk_start , chunk_end ) ) ;
        chunk_start = chunk_end ;
    }
    //******************************************************
    // Breaking part code over
    //******************************************************

    unicode_box.value = processed_text ;
    unicode_box.select() ;

    //--------------------------------------------------
    // Returns the index at which the chunk starting at `start` ends (exclusive).
    // The result is always greater than `start`, so the caller's loop always advances.
    function find_chunk_end( start )
    {
        var limit = start + max_text_size ;
        var split ;

        // What is left fits into one chunk.
        if ( limit >= text_size ) { return text_size ; }

        // Preferred: the last space at or before the size limit.
        split = legacy_text.lastIndexOf( ' ' , limit ) ;
        if ( split > start ) { return split ; }

        // No usable space inside the window (one very long word, or the only space is the one
        // this chunk starts on). Searching backwards any further would never terminate, so
        // extend the chunk forward to the next space, or to the end of the text.
        split = legacy_text.indexOf( ' ' , limit + 1 ) ;
        return ( split === -1 ) ? text_size : split ;
    }

    //--------------------------------------------------
    // Converts one chunk of legacy text and returns the Unicode result.
    function Replace_Symbols( text )
    {
        var pair_idx ;
        var legacy_symbol ;
        var unicode_symbol ;

        // A blank string needs no processing.
        if ( text === "" ) { return text ; }

        // Substitute every odd (replacement) element for its preceding even (legacy) element.
        for ( pair_idx = 0 ; pair_idx + 1 < array_one.length ; pair_idx += 2 )
        {
            legacy_symbol = array_one[ pair_idx ] ;
            unicode_symbol = array_one[ pair_idx + 1 ] ;

            // One occurrence at a time until none is left: a replacement can join up with the
            // following text to form a new occurrence (e.g. "ईंं" -> "ईं" applied to "ईंंं").
            while ( text.indexOf( legacy_symbol ) !== -1 )
            {
                text = text.replace( legacy_symbol , unicode_symbol ) ;
            }
        }

        // fix maatraas typed wrongly: put anusvara/chandrabindu after the vowel sign,
        // and drop a vowel sign that directly follows another one
        text = text.replace( /([ंँ॰])([ािीुूृेैोौ])/g , "$2$1" ) ;
        text = text.replace( /([ािीुूृेैोौंँ])([ािीुूृेैोौ])/g , "$1" ) ;

        // code for replacing "z", "Z" and "o" with "ि" (chhotee ee kii maatraa) and correcting its position too.
        // p is handled the same way to get a matra like भिं
        // µ and ‹ stand for ़ (dot at bottom); it precedes the char so it needs to be bumped ahead.
        text = text.replace( leading_before_consonant , "$2$1" ) ;
        // applied twice so the glyph can cross a conjunct with two half letters
        text = text.replace( leading_before_half_form , "$2$3$1" ) ;
        text = text.replace( leading_before_half_form , "$2$3$1" ) ;
        text = text.replace( /z/g , "ि" ) ;
        text = text.replace( /Z/g , "ि" ) ;
        text = text.replace( /o/g , "ि" ) ;
        text = text.replace( /p/g , "िं" ) ;
        text = text.replace( /µ/g , "़" ) ;
        text = text.replace( /‹/g , "़" ) ;

        //=============================================================
        // Eliminating "©" (reph) and putting 'half - r' at proper position for this.
        // This bumps the marker back, in front of the consonant it follows.
        text = text.replace( reph_after_consonant , "$3$1$2" ) ;
        text = text.replace( reph_after_half_form , "$3$1$2" ) ;
        text = text.replace( reph_after_half_form , "$3$1$2" ) ;
        text = text.replace( /©/g , "र्" ) ;
        //=============================================================

        // Optional (disabled) PDF conversion fix: bump "°" back exactly like "©" above
        // (same three regexes with ° in place of ©), then replace every "°" with "ॅ".

        return text ;
    } // end of the function Replace_Symbols
} // end of convert_to_unicode function
</script>
</head>
<body>
<form name="form1">
<table width="100%">
<tr width="100%">
<td width="35%">
<b>PuneBudget</b> font text-box<br/>
<textarea name="input" id="legacy_text" cols="60" rows="20" ></textarea>
<br>
<!-- onFocus="this.select();"-->
<div align="center">
<input type="button" id="converter1" name="converter" value=" Convert to Unicode >> " onClick="convert_to_unicode();" accesskey="c" title="शॉर्टकट shift+alt+c">
</div> <br>
<input type="button" value=" Clear input box " onClick="document.getElementById('legacy_text').value='';document.getElementById('legacy_text').focus();" accesskey="x" title="शॉर्टकट shift+alt+x">
</td>
<td width="65%">
<b>Unicode</b> text-box<br/>
<textarea name="unicode_text" id="unicode_text" cols="80" rows="25" style="font-size: 15px"></textarea>
</td>
</tr>
</table>
<br />
</form>
<h2>PuneBudget &lt;==&gt; यूनिकोड परिवर्तित्र </h2>
<p>Originally created to convert the text in <a href="http://www.punecorporation.org">Pune Municipal Corporation's</a> 2014-15 Standing Committee Budget PDF document into Unicode format. <a href="http://www.punecorporation.org/informpdf/budget/1415%20SC%20BUDGET%20BOOK.pdf">See the PDF here</a>.
<br> Please see the source HTML code of this file to see the substitution list.
<br> Code adapted from other legacy font to Unicode converters here: <a href="https://sites.google.com/site/technicalhindi/home/converters">https://sites.google.com/site/technicalhindi/home/converters</a></p>
<p>Want to use this offline? Just Save this page to your computer, or copy over the source code. No extra dependencies!<br>
Found any inaccuracy in conversion? Please email nikhil.js [at] gmail.com with source text and result.</p>
<br>यूनिकोड, विश्व की गैर-रोमन भाषाओं के लिये वरदान है।
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment