Skip to content

Instantly share code, notes, and snippets.

@dchaplinsky
Created October 17, 2015 23:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dchaplinsky/71f00747c9eb4577ac57 to your computer and use it in GitHub Desktop.
Save dchaplinsky/71f00747c9eb4577ac57 to your computer and use it in GitHub Desktop.
# coding: utf-8
from __future__ import unicode_literals
from unicodecsv import reader, writer
class ConvertException(Exception):
    """Raised when a tagline cannot be fully decoded using the mapping."""
def guess(tagline, mapping):
    """Decode a concatenated tag string into the tags it is made of.

    The tagline is consumed from the left: at each step the longest key of
    ``mapping`` that prefixes the remaining text is cut off, its mapped
    value is recorded, and spaces/dots around the remainder are stripped.

    Args:
        tagline: Text composed of mapping keys, optionally separated by
            spaces and dots.
        mapping: Dict from source tag text to the converted tag.

    Returns:
        List of the distinct converted tags, in order of first appearance.

    Raises:
        ConvertException: If no key prefixes the remaining text (this
            includes an empty ``tagline``). The exception argument is the
            part of the tagline that could not be decoded.
    """
    # Longest keys first, so a short key never shadows a longer key that
    # starts with it. Empty keys are dropped: they prefix everything without
    # consuming any input, which would make the loop below spin forever.
    # sorted() is used instead of keys().sort() so this runs on Python 3 too.
    keys = sorted((k for k in mapping if k), key=len, reverse=True)

    res = []
    while True:
        for r in keys:
            if tagline.startswith(r):
                value = mapping[r]
                # De-duplicate while keeping first-seen order, so the
                # result is deterministic from run to run.
                if value not in res:
                    res.append(value)
                tagline = tagline[len(r):].strip(" .")
                break
        else:
            # No key matched what is left of the tagline.
            raise ConvertException(tagline)

        if not tagline:
            break

    return res
if __name__ == '__main__':
    # Build the lookup table: column 0 is the source tag text, column 1 is
    # the tag it converts to. unicodecsv handles encoding/decoding itself,
    # so the files are opened in binary mode.
    mapping = {}
    with open("mapping_darchuk.csv", "rb") as fp:
        for row in reader(fp):
            if len(row) < 2:
                # Blank or truncated line: nothing to map.
                continue
            mapping[row[0].strip()] = row[1].strip()

    # Decode column 1 of every tagset row and write the row back out with
    # the decoded tags appended as an extra ":"-joined column.
    with open("tagset_darchuk_strange.csv", "rb") as fp, \
            open("decoded_mapping.csv", "wb") as fp_out:
        w = writer(fp_out)
        for row in reader(fp):
            if len(row) < 2:
                # Blank or truncated line: there is no tagline to decode.
                continue
            normal = row[1].strip()
            try:
                cnvrtd = guess(normal, mapping)
            except ConvertException as e:
                # Report the undecodable tagline and keep going; the row is
                # still written, with an empty decoded column.
                print("%s: %s" % (normal, e))
                cnvrtd = []
            w.writerow(row + [":".join(cnvrtd)])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment