Skip to content

Instantly share code, notes, and snippets.

@JGVerdugo
Created November 8, 2012 03:31
Show Gist options
  • Save JGVerdugo/4036521 to your computer and use it in GitHub Desktop.
Save JGVerdugo/4036521 to your computer and use it in GitHub Desktop.
Replaces punctuation with tags
#!/usr/bin/python
import codecs
import os
import re
import unicodedata
# Loads a UTF-8 text file into memory as a character string
def readDoc(filename):
    """Read a UTF-8 encoded text file and return its contents as a string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded contents of the whole file.
    """
    # The context manager closes the handle even when the read fails
    # part-way through (e.g. on bytes that are not valid UTF-8).
    with codecs.open(filename, "r", "UTF-8") as infile:
        return infile.read()
# Writes a character string (doc) as a UTF-8 text file
def writeDoc(doc, filename="newfile.txt"):
    """Write a character string to disk as a UTF-8 encoded text file.

    Args:
        doc: The text to write.
        filename: Destination path. Defaults to "newfile.txt", resolved
            against the current working directory. The file is opened in
            "w" mode, so existing content at that path is replaced.
    """
    # The context manager closes (and flushes) the handle even when the
    # write raises.
    with codecs.open(filename, "w", "UTF-8") as outfile:
        outfile.write(doc)
# Gets the character's Unicode category and its name. Returns a tuple containing these two values.
def getCharName(char):
    """Look up the Unicode general category and name of one character.

    Args:
        char: A single Unicode character.

    Returns:
        A ``(category, name)`` tuple, e.g. ``("Po", "COMMA")`` for ",".

    Raises:
        ValueError: Propagated from ``unicodedata.name`` when the character
            has no name assigned (control characters, for instance).
    """
    return (unicodedata.category(char), unicodedata.name(char))
# Replaces every punctuation or symbol character in the document (Unicode categories P* and S*)
# with a tag containing its Unicode name, e.g. "," becomes "<COMMA>"
def handleText(doc):
    """Replace punctuation and symbol characters with tags naming them.

    Each character whose Unicode general category starts with "P"
    (punctuation) or "S" (symbol) is replaced by its Unicode name wrapped in
    angle brackets, e.g. "," becomes "<COMMA>". Every other character is
    copied through unchanged.

    Args:
        doc: The text to process, as a Unicode string.

    Returns:
        A new string with the replacements applied; an empty input yields
        an empty string.
    """
    pieces = []
    for char in doc:
        # A prefix test on the two-letter category selects exactly the
        # P* and S* categories, without running a regex per character.
        if unicodedata.category(char).startswith(("P", "S")):
            pieces.append("<" + unicodedata.name(char) + ">")
        else:
            pieces.append(char)
    # Join once at the end instead of growing a string with += in the loop,
    # which can degrade to quadratic time on long documents.
    return "".join(pieces)
# Script body: read "text.txt", tag its punctuation and symbol characters,
# and hand the tagged text to writeDoc for output.
doc = handleText(readDoc("text.txt"))
writeDoc(doc)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment