Skip to content

Instantly share code, notes, and snippets.

@srikanthlogic
Created May 31, 2012 17:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save srikanthlogic/2844767 to your computer and use it in GitHub Desktop.
Save srikanthlogic/2844767 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
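# Script to list the titles that contain ZWNJ/ZWJ/ZWSP (zero-width non-joiner, joiner, space)
# in http://dumps.wikimedia.org/tawiki/20120526/tawiki-20120526-all-titles-in-ns0.gz
# Usage: pass the (uncompressed) titles file as the first argument and the report file as the second.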
# Srikanth Logic
import sys
def main():
    """Report lines that contain zero-width characters as wiki links.

    Reads the file named by ``sys.argv[1]`` and appends three sections
    (ZWNJ, ZWJ, ZWSP) to the file named by ``sys.argv[2]``. Every line that
    contains the section's character is written as
    ``* [[line]], [[line with the character removed]]``.

    Both files are handled as raw bytes so that the UTF-8 byte sequences
    below match the same way on Python 2 and Python 3.

    Raises:
        IndexError: if fewer than two command-line arguments are given.
        IOError / OSError: if either file cannot be opened.
    """
    src_path = sys.argv[1]
    dest_path = sys.argv[2]

    # (section label, UTF-8 encoding of the character), in output order.
    markers = (
        (b"ZWNJ", b"\xe2\x80\x8c"),  # U+200C ZERO WIDTH NON-JOINER
        (b"ZWJ", b"\xe2\x80\x8d"),  # U+200D ZERO WIDTH JOINER
        (b"ZWSP", b"\xe2\x80\x8b"),  # U+200B ZERO WIDTH SPACE
    )
    entries = dict((label, []) for label, _ in markers)

    # A file object can only be iterated once, so every marker is checked
    # during a single pass instead of looping over the file once per marker
    # (the later loops would otherwise see an exhausted iterator).
    with open(src_path, "rb") as source_file:
        for raw_line in source_file:
            title = raw_line.rstrip(b"\r\n")
            for label, marker in markers:
                if marker in title:
                    entries[label].append(
                        b"* [[" + title + b"]], [["
                        + title.replace(marker, b"") + b"]]\n"
                    )

    # Append mode is kept so an existing report is extended, not overwritten.
    with open(dest_path, "ab") as output_file:
        for label, _ in markers:
            output_file.write(label + b" =====================\n")
            output_file.writelines(entries[label])
if __name__ == "__main__":
    # Run the report only when executed as a script, not when imported.
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment