Last active
September 11, 2017 12:08
-
-
Save answerquest/74c13f73f1bfb21c3177 to your computer and use it in GitHub Desktop.
Convert legacy font in Pune's Budget book to Unicode Devanagari script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<html> | |
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>PuneBudget &lt;==&gt; यूनिकोड परिवर्तित्र (12-11-2014) </title>
<script type="text/javascript"> | |
// for Marathi script used by Pune Municipal Corporation in their budget book, http://www.punecorporation.org/informpdf/budget/1415%20SC%20BUDGET%20BOOK.pdf | |
/*
 * Convert text typed in the legacy "PuneBudget" font into Unicode Devanagari.
 *
 * Reads the value of the element with id "legacy_text", converts it in chunks
 * of at most 6000 characters, writes the result into the element with id
 * "unicode_text" and finally selects that element's content.
 *
 * Takes no parameters and returns nothing; all input/output goes through the
 * two DOM elements named above.
 */
function convert_to_unicode()
{
    // Flat list of pairs: legacy sequence, Unicode replacement, legacy sequence, ...
    // ORDER MATTERS: the pairs are applied top to bottom, and longer sequences
    // must come before the shorter sequences they contain.
    var array_one = [
        //beginning replacements : need to do these before anything else
        //errors:
        "Pv=§=§" , "टुं" ,
        "ìg¡«" , "र्व्है" ,
        "P²´" , "P´" ,
        "@lh§§§ §Wq" , "आळंदी" ,
        //errors done
        //ZAPPING the $,=,Ÿ etc completely. That would free us up from having to define so many combos for क , फ , ट etc.
        "$" , "" , //ZAPPING the $.. how symbolic..
        "Ÿ", "" ,
        "=" , "" ,
        //Zapping done
        //end beginning replacement section
        /* //Optional : Numbers conversion : We might not want to convert the digits. The Budget book keeps the digits in English script only.
        "1" , "१" ,
        "2" , "२" ,
        "3" , "३" ,
        "4" , "४" ,
        "5" , "५" ,
        "6" , "६" ,
        "7" , "७" ,
        "8" , "८" ,
        "9" , "९" ,
        "0" , "०" ,
        */
        "@l°" , "ऑ" ,
        "@ln" , "ओ" ,
        "@l¡" , "औ" ,
        "@l¢" , "औं" ,
        "@l" , "आ" ,
        "@" , "अ" ,
        "A©§" , "ईं" ,
        "A©" , "ई" ,
        "A" , "इ" ,
        "B" , "उ" ,
        "C" , "ऊ" , //usually appearing with $ next to it, but zapping the $.
        "Dn" , "ऐ" ,
        "D" , "ए" ,
        "E" , "ऋ" ,
        "G" , "क" , // test: changed from G$.. trying out zapping the $ entirely
        "Š" , "क्" ,
        "³" , "क्" ,
        "¸" , "क्क" ,
        "¹" , "क्व" ,
        "º" , "क्त" ,
        "H" , "ख" ,
        "»" , "ख्" ,
        "¼" , "ख्र" ,
        "I" , "ग" ,
        "½" , "ग्" ,
        "J" , "घ" ,
        "¿" , "घ्" ,
        "L" , "च" ,
        "À" , "च्" ,
        "ƒ" , "च्च" ,
        "M" , "छ" ,
        "N" , "ज" ,
        "Á" , "ज्" ,
        "‚l" , "ज्ज" ,
        "O" , "झ" ,
        "Â" , "झ्" ,
        "Äl" , "ञ" ,
        "P—" , "ट्र" ,
        "P" , "ट" ,
        "ï" , "ष्ट" ,
        "Å" , "ट्ट" ,
        "Q" , "ठ" ,
        "Æ" , "ट्ठ" ,
        "Ç" , "ठ्ठ" ,
        "ð" , "ष्ठ" ,
        "R—" , "ड्र" ,
        "R" , "ड" ,
        "È" , "ड्ड" ,
        "S" , "ढ" ,
        "T" , "ण" ,
        "Ê" , "ण्" ,
        "U" , "त" ,
        "Ë" , "त्" ,
        "Îl" , "त्त" ,
        "Î" , "त्त्" ,
        "V" , "थ" ,
        "Ϩl" , "थ" ,
        "Ï" , "थ्" ,
        "W" , "द" ,
        "Ð" , "द्र" ,
        "Ö" , "द्र" ,
        "Û" , "द्व" ,
        "Ú" , "द्य" ,
        "Ô" , "द्द" ,
        "Ñ" , "दृ" ,
        "×" , "ब्द" ,
        "Õ" , "द्ध" ,
        "Ù" , "द्म" ,
        "X" , "ध" ,
        "Ü" , "ध्" ,
        "Y" , "न" ,
        "Ý" , "न्" ,
        "Þ" , "न्न" ,
        "¾" , "ग्न" ,
        "m" , "प" ,
        "ß" , "प्" ,
        "à" , "प्र" ,
        "á", "प्त" ,
        "[ l", "फा",
        "[" , "फ" ,
        "â" , "फ्" ,
        "}" , "ब" ,
        "ã" , "ब्" ,
        "]" , "भ" ,
        "ä" , "भ्" ,
        "^" , "म" ,
        "å" , "म्" ,
        "_" , "य" ,
        "æ" , "य्" ,
        "ç" , "्य" ,
        "`" , "र" ,
        "è" , "ऱ्" ,
        "é" , "रु" ,
        "ê" , "रू" ,
        "«" , "्र" , // makes ट्र ड्र etc
        "´" , "्र" , // makes ट्र ड्र etc
        "a" , "ल" ,
        "b" , "ल" ,
        "ë" , "ल्" ,
        "„" , "ल्ल" ,
        "h" , "ळ" ,
        "ù" , "ळ्" ,
        "d" , "श" ,
        "í" , "श्" ,
        "c" , "व" ,
        "ì" , "व्" ,
        "e" , "ष" ,
        "îl" , "ष" ,
        "î" , "ष्" , //make sure this comes after the îl line
        "f" , "स" ,
        "ñ" , "स्" ,
        "ó" , "स्त्र" ,
        "g" , "ह" ,
        "ô" , "ह्" ,
        "ö" , "हृ" ,
        "ø" , "ह्य" ,
        "i" , "क्ष" ,
        "úl" , "क्ष" ,
        "ú" , "क्ष्" ,
        "j" , "ज्ञ" ,
        "k" , "श्र" ,
        "Ì" , "त्र" ,
        "Í" , "त्र्" ,
        //symbols:
        "&" , "।" ,
        //"\"", "'",
        ">" , "?" ,
        //Maatraas:
        // combos of preceding and succeeding matras. Replacing them with © which will be processed as preceding र् as well as whichever succeeding matra they correspond to
        "t" , "©ी" , //like in र्वी
        "|" , "©े" ,
        "ª" , "©ं" , // "ª" corresponds to the matra in "पर्यंत",
        "u" , "©ीं" , //like in र्थीं
        "\\" , "©ें" , // "\" corresponds to the triple-matra in र्जें.
        "²" , "्" , // keep at end, after other uses of ² have been done
        "ln°" , "ॉ" , //added on 17 Apr 2015 after getting a कोॅलेज
        "l°" , "ॉ" ,
        "ln" , "ो" ,
        "l¡" , "ौ" ,
        "l¢" , "ौं" ,
        "q" , "ी" , //q for thin-on-the-right letters like न, r for fat letters like क
        "r" , "ी" ,
        "v" , "ु" ,
        "þ" , "ु" , //another version that's not directly under the char but a little to the right
        "x" , "ू" ,
        "ÿ" , "ू" , //another version that's not directly under the char but a little to the right
        "¥" , "ृ" ,
        "n" , "े" ,
        "¡" , "ै" ,
        "§" , "ं" ,
        "¨" , "ं" ,
        "±" , "ँ" ,
        // ":" , "ः" ,
        "…" , ":" , //causing issues.. most places it's a simple colon only.
        "°" , "ॅ" ,
        //"" , "ऽ" ,
        "l" , "ा" ,
        //"" , "।" , // dandaa
        //"" , "॰"
        "s" , "ीं" ,
        "{" , "ें" ,
        "्ो" , "े" ,
        "्ौ" , "ै" ,
        "्ाे" , "े" ,
        "्ाा" , "ा" ,
        "ाे" , "ो" ,
        "ाे" , "ो" ,
        "ाै" , "ौ" ,
        "्ा" , "" ,
        "ंु" , "ुं" ,
        "ओ े" , "ओ" , // "ओ" + "े" ,
        "ोे" , "ो" ,
        "ाे" , "ो" ,
        "ईंं" , "ईं"
    ] ;
    var array_one_length = array_one.length ;

    // Character-class body shared by the reordering regexes below (no regex
    // metacharacters inside, so it can be spliced into a pattern as-is).
    var consonants = "कखगघङचछजझञटठडड़ढढ़णतथदधनपफबभमयरलवशषसहक़ख़ग़ज़ड़ढ़फ़" ;

    // Markers that the legacy font types BEFORE the consonant they belong to.
    var pre_marker_1 = new RegExp( "([µ‹opzZ])([" + consonants + "©])" , "g" ) ;
    var pre_marker_2 = new RegExp( "([µ‹opzZ])(्)([" + consonants + "©])" , "g" ) ;
    // "©" (reph placeholder) is typed AFTER the syllable and has to move in front.
    var reph_after_matras = new RegExp( "([" + consonants + "])([ािीुूृेैोौंँ]*)([©])" , "g" ) ;
    var reph_after_halant = new RegExp( "([" + consonants + "])([्])([©])" , "g" ) ;

    var legacy_box = document.getElementById("legacy_text") ;
    var unicode_box = document.getElementById("unicode_text") ;

    unicode_box.value = "You have chosen SIMPLE TEXT in PMC-Budget font to convert into Unicode. Conversion in progress.." ;

    // (A disabled experiment for PDF-sourced text collapsed runs of spaces in
    // the input here; it is intentionally not applied.)
    var legacy_text = legacy_box.value ;

    //******************************************************
    // Break the long text into small bunches of max. max_text_size characters each.
    //******************************************************
    var text_size = legacy_text.length ;
    var max_text_size = 6000 ;
    var processed_text = '' ;
    var chunk_start = 0 ;
    var chunk_end = 0 ;
    var more_chunks = true ;

    while ( more_chunks )
    {
        chunk_start = chunk_end ;
        if ( chunk_end < ( text_size - max_text_size ) )
        {
            chunk_end += max_text_size ;
            // Back up to the nearest space so a chunk ends between words, but
            // never back to (or past) the start of the chunk: an unbounded scan
            // would make no progress and loop forever on space-free text.
            while ( chunk_end > chunk_start && legacy_text.charAt( chunk_end ) !== ' ' ) { chunk_end-- ; }
            if ( chunk_end === chunk_start )
            {
                // No usable space in this chunk: cut at the size limit instead.
                // This may split a multi-character legacy sequence, but it
                // guarantees the conversion terminates.
                chunk_end = chunk_start + max_text_size ;
            }
        }
        else { chunk_end = text_size ; more_chunks = false ; }

        processed_text += Replace_Symbols( legacy_text.substring( chunk_start, chunk_end ) ) ;
        unicode_box.value = processed_text ;
    }

    unicode_box.select() ;

    //--------------------------------------------------
    // Replace every occurrence of `from` in `text` by `to`, always taking the
    // leftmost occurrence first and repeating until none is left (so
    // occurrences created by an earlier replacement are handled too).
    function replace_until_gone( text, from, to )
    {
        if ( from === "" ) { return text ; } // an empty pattern would never go away
        var at = text.indexOf( from ) ;
        while ( at !== -1 )
        {
            text = text.slice( 0, at ) + to + text.slice( at + from.length ) ;
            // A new occurrence has to overlap the edited spot, so resume just
            // before it rather than rescanning from the start of the string.
            at = text.indexOf( from, Math.max( 0, at - from.length + 1 ) ) ;
        }
        return text ;
    }

    //--------------------------------------------------
    // Convert one chunk of legacy text and return the Unicode result.
    function Replace_Symbols( modified_substring )
    {
        // A blank chunk needs no processing at all.
        if ( modified_substring === "" ) { return modified_substring ; }

        // Substitute each legacy sequence with its Unicode counterpart, in table order.
        for ( var pair_idx = 0 ; pair_idx < array_one_length - 1 ; pair_idx += 2 )
        {
            modified_substring = replace_until_gone( modified_substring, array_one[ pair_idx ], array_one[ pair_idx + 1 ] ) ;
        }

        // Put anusvara/chandrabindu after the vowel sign, then drop a vowel
        // sign that directly follows another sign (maatras typed wrongly).
        modified_substring = modified_substring.replace( /([ंँ॰])([ािीुूृेैोौ])/g , "$2$1" ) ;
        modified_substring = modified_substring.replace( /([ािीुूृेैोौंँ])([ािीुूृेैोौ])/g , "$1" ) ;

        // "z", "Z", "o" stand for "ि", "p" for "िं", and "µ"/"‹" for the nukta "़".
        // They precede their consonant in the legacy text, so move them behind
        // it first; the halant variant runs twice to step over two half-letters.
        modified_substring = modified_substring.replace( pre_marker_1 , "$2$1" ) ;
        modified_substring = modified_substring.replace( pre_marker_2 , "$2$3$1" ) ;
        modified_substring = modified_substring.replace( pre_marker_2 , "$2$3$1" ) ;
        modified_substring = modified_substring.replace( /[zZo]/g , "ि" ) ;
        modified_substring = modified_substring.replace( /p/g , "िं" ) ;
        modified_substring = modified_substring.replace( /[µ‹]/g , "़" ) ;

        //=============================================================
        // Eliminate "©" (reph): move it in front of the syllable it follows
        // (again stepping over up to two half-letters), then spell it as "र्".
        modified_substring = modified_substring.replace( reph_after_matras , "$3$1$2" ) ;
        modified_substring = modified_substring.replace( reph_after_halant , "$3$1$2" ) ;
        modified_substring = modified_substring.replace( reph_after_halant , "$3$1$2" ) ;
        modified_substring = modified_substring.replace( /©/g , "र्" ) ;
        //=============================================================
        // (A disabled experiment for PDF-sourced text moved "°" in front of the
        // syllable in the same way and mapped it to "ॅ"; intentionally not applied.)

        return modified_substring ;
    } // end of the function Replace_Symbols
} // end of convert_to_unicode function
</script> | |
</head> | |
<body> | |
<form name="form1"> | |
<table width="100%"> | |
<tr width="100%"> | |
<td width="35%"> | |
<b>PuneBudget</b> font text-box<br/> | |
<textarea name="input" id="legacy_text" cols="60" rows="20" ></textarea> | |
<br> | |
<!-- onFocus="this.select();"--> | |
<div align="center"> | |
<input type="button" id="converter1" name="converter" value=" Convert to Unicode >> " onClick="convert_to_unicode();" accesskey="c" title="शॉर्टकट shift+alt+c"> | |
</div> <br> | |
<input type="button" value=" Clear input box " onClick="document.getElementById('legacy_text').value='';document.getElementById('legacy_text').focus();" accesskey="x" title="शॉर्टकट shift+alt+x"> | |
</td> | |
<td width="65%"> | |
<b>Unicode</b> text-box<br/> | |
<textarea name="unicode_text" id="unicode_text" cols="80" rows="25" style="font-size: 15px"></textarea> | |
</td> | |
</tr> | |
</table> | |
<br /> | |
</form> | |
<h2>PuneBudget <==> यूनिकोड परिवर्तित्र </h2> | |
<p>Originally created to convert the text in <a href="http://www.punecorporation.org">Pune Municipal Corporation's</a> 2014-15 Standing Committee Budget PDF document into Unicode format. <a href="http://www.punecorporation.org/informpdf/budget/1415%20SC%20BUDGET%20BOOK.pdf">See the PDF here</a>. | |
<br> Please see the source HTML code of this file to see the substitution list. | |
<br> Code adapted from other legacy font to Unicode converters here: <a href="https://sites.google.com/site/technicalhindi/home/converters">https://sites.google.com/site/technicalhindi/home/converters</a></p> | |
<p>Want to use this offline? Just Save this page to your computer, or copy over the source code. No extra dependencies!<br> | |
Found any inaccuracy in conversion? Please email nikhil.js [at] gmail.com with source text and result.</p> | |
<br>यूनिकोड, विश्व की गैर-रोमन भाषाओं के लिये वरदान है। | |
</body> | |
</html> | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment