Skip to content

Instantly share code, notes, and snippets.

@eculver
Created April 18, 2010 04:30
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save eculver/370004 to your computer and use it in GitHub Desktop.
Save eculver/370004 to your computer and use it in GitHub Desktop.
Convert Microsoft Word special characters to HTML entities
import re
# Windows-1252 assigns printable characters to code points 0x80-0x9F.
# Keys are those code points as they show up in mis-decoded text; values are
# the characters they were meant to be.  Everything is written with escapes
# so this source stays pure ASCII.
# Table from http://www.microsoft.com/typography/unicode/1252.htm
_CP1252_GREMLINS = {
    u"\x80": u"\u20ac",  # EURO SIGN
    u"\x82": u"\u201a",  # SINGLE LOW-9 QUOTATION MARK
    u"\x83": u"\u0192",  # LATIN SMALL LETTER F WITH HOOK
    u"\x84": u"\u201e",  # DOUBLE LOW-9 QUOTATION MARK
    u"\x85": u"\u2026",  # HORIZONTAL ELLIPSIS
    u"\x86": u"\u2020",  # DAGGER
    u"\x87": u"\u2021",  # DOUBLE DAGGER
    u"\x88": u"\u02c6",  # MODIFIER LETTER CIRCUMFLEX ACCENT
    u"\x89": u"\u2030",  # PER MILLE SIGN
    u"\x8a": u"\u0160",  # LATIN CAPITAL LETTER S WITH CARON
    u"\x8b": u"\u2039",  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    u"\x8c": u"\u0152",  # LATIN CAPITAL LIGATURE OE
    u"\x8e": u"\u017d",  # LATIN CAPITAL LETTER Z WITH CARON
    u"\x91": u"\u2018",  # LEFT SINGLE QUOTATION MARK
    u"\x92": u"\u2019",  # RIGHT SINGLE QUOTATION MARK
    u"\x93": u"\u201c",  # LEFT DOUBLE QUOTATION MARK
    u"\x94": u"\u201d",  # RIGHT DOUBLE QUOTATION MARK
    u"\x95": u"\u2022",  # BULLET
    u"\x96": u"\u2013",  # EN DASH
    u"\x97": u"\u2014",  # EM DASH
    u"\x98": u"\u02dc",  # SMALL TILDE
    u"\x99": u"\u2122",  # TRADE MARK SIGN
    u"\x9a": u"\u0161",  # LATIN SMALL LETTER S WITH CARON
    u"\x9b": u"\u203a",  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    u"\x9c": u"\u0153",  # LATIN SMALL LIGATURE OE
    u"\x9e": u"\u017e",  # LATIN SMALL LETTER Z WITH CARON
    u"\x9f": u"\u0178",  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}

# Matches every code point in the 0x80-0x9F range.  0x81, 0x8D, 0x8F, 0x90
# and 0x9D have no entry in the table above, so they pass through unchanged.
_CP1252_GREMLIN_RE = re.compile(u"[\x80-\x9f]")


def convert_1252_codes(text):
    """Replace Windows-1252 "gremlin" code points with the intended characters.

    Code points U+0080-U+009F that have an entry in the Windows-1252 table
    are replaced by the corresponding Unicode character; all other
    characters are left untouched.

    NOTE(review): the gist describes the replacements as "html entities",
    but the table as visible here holds the characters themselves, so this
    function returns characters, not ``&...;`` references -- confirm which
    is wanted.

    @param text String to filter.  A byte string is decoded as ISO-8859-1
        first, so each byte maps 1:1 onto the code point of the same value.
    @type text bytes/unicode
    @return unicode version of the filtered string

    Adapted from: http://effbot.org/zone/unicode-gremlins.htm
    """
    # Decode up front, unconditionally: the result is then always text, and
    # high bytes outside the gremlin range (e.g. 0xE9) no longer hit an
    # implicit ASCII decode.  ``bytes`` is ``str`` on Python 2.
    if isinstance(text, bytes):
        text = text.decode("iso-8859-1")

    def fixup(match):
        char = match.group(0)
        # Code points without a table entry are returned as-is.
        return _CP1252_GREMLINS.get(char, char)

    # sub() hands back the input unchanged when nothing matches, so no
    # separate pre-scan with search() is needed.
    return _CP1252_GREMLIN_RE.sub(fixup, text)
@quandyfactory
Copy link

This Cp1252-to-Unicode hashtable has been helpful to me:

# Lookup table: Windows-1252 code points 0x80-0x9F -> the Unicode characters
# they stand for.
# Source: http://www.microsoft.com/typography/unicode/1252.htm
# 0x81, 0x8D, 0x8F, 0x90 and 0x9D are absent from the table.
cp_1252_chars = {
    # --- 0x80 .. 0x8F ---
    u"\x80": u"\u20ac",  # EURO SIGN
    u"\x82": u"\u201a",  # SINGLE LOW-9 QUOTATION MARK
    u"\x83": u"\u0192",  # LATIN SMALL LETTER F WITH HOOK
    u"\x84": u"\u201e",  # DOUBLE LOW-9 QUOTATION MARK
    u"\x85": u"\u2026",  # HORIZONTAL ELLIPSIS
    u"\x86": u"\u2020",  # DAGGER
    u"\x87": u"\u2021",  # DOUBLE DAGGER
    u"\x88": u"\u02c6",  # MODIFIER LETTER CIRCUMFLEX ACCENT
    u"\x89": u"\u2030",  # PER MILLE SIGN
    u"\x8a": u"\u0160",  # LATIN CAPITAL LETTER S WITH CARON
    u"\x8b": u"\u2039",  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    u"\x8c": u"\u0152",  # LATIN CAPITAL LIGATURE OE
    u"\x8e": u"\u017d",  # LATIN CAPITAL LETTER Z WITH CARON

    # --- 0x90 .. 0x9F ---
    u"\x91": u"\u2018",  # LEFT SINGLE QUOTATION MARK
    u"\x92": u"\u2019",  # RIGHT SINGLE QUOTATION MARK
    u"\x93": u"\u201c",  # LEFT DOUBLE QUOTATION MARK
    u"\x94": u"\u201d",  # RIGHT DOUBLE QUOTATION MARK
    u"\x95": u"\u2022",  # BULLET
    u"\x96": u"\u2013",  # EN DASH
    u"\x97": u"\u2014",  # EM DASH
    u"\x98": u"\u02dc",  # SMALL TILDE
    u"\x99": u"\u2122",  # TRADE MARK SIGN
    u"\x9a": u"\u0161",  # LATIN SMALL LETTER S WITH CARON
    u"\x9b": u"\u203a",  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    u"\x9c": u"\u0153",  # LATIN SMALL LIGATURE OE
    u"\x9e": u"\u017e",  # LATIN SMALL LETTER Z WITH CARON
    u"\x9f": u"\u0178",  # LATIN CAPITAL LETTER Y WITH DIAERESIS
}

And the function:

def fix_1252_codes(text):
    """
    Replace non-standard Microsoft character codes from the Windows-1252
    character set in a unicode string with proper unicode codes.

    Every code point in the range U+0080-U+009F that has an entry in the
    module-level ``cp_1252_chars`` table is replaced by its mapped value;
    code points without an entry are left as they are.

    A byte string is also accepted: if it contains a byte in 0x80-0x9F it is
    decoded as ISO-8859-1 (each byte becomes the code point of the same
    value) and fixed up, yielding text; otherwise it is returned unchanged,
    still as a byte string.

    Code originally from: http://effbot.org/zone/unicode-gremlins.htm
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so this check covers the
    # old ``type("")`` test there and real byte strings on Python 3.
    if isinstance(text, bytes):
        # Scan with a bytes pattern: a text pattern cannot be applied to
        # bytes on Python 3.
        if not re.search(b"[\x80-\x9f]", text):
            return text
        text = text.decode("iso-8859-1")

    def fixup(match):
        char = match.group(0)
        return cp_1252_chars.get(char, char)

    # sub() returns the input unchanged when there is nothing to replace.
    return re.sub(u"[\x80-\x9f]", fixup, text)

@eculver
Copy link
Author

eculver commented Apr 21, 2010

Yessss, thank you! My gist shall be forever replaced by fix_1252_codes.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment