Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# -*- coding: utf-8 -*-
# Script to list page titles containing ZWNJ/ZWJ/ZWSP characters, using the title list from http://dumps.wikimedia.org/tawiki/20120526/tawiki-20120526-all-titles-in-ns0.gz
# Srikanth Logic
import sys
# Zero-width characters to report, in output order: (section label, UTF-8 bytes).
# Matching is done on raw bytes so the script behaves the same on Python 2 and
# Python 3 and never fails on undecodable input.
_ZERO_WIDTH_MARKERS = (
    (b"ZWNJ", b"\xe2\x80\x8c"),  # U+200C ZERO WIDTH NON-JOINER
    (b"ZWJ", b"\xe2\x80\x8d"),  # U+200D ZERO WIDTH JOINER
    (b"ZWSP", b"\xe2\x80\x8b"),  # U+200B ZERO WIDTH SPACE
)


def _collect_matches(source_file):
    """Scan *source_file* once and group its lines by the marker they contain.

    Returns a dict mapping each marker byte sequence to the list of lines
    (trailing CR/LF removed) that contain it, in file order. A line holding
    several different markers is listed under each of them.
    """
    matches = {marker: [] for _, marker in _ZERO_WIDTH_MARKERS}
    for raw_line in source_file:
        title = raw_line.rstrip(b"\r\n")
        for _, marker in _ZERO_WIDTH_MARKERS:
            if marker in title:
                matches[marker].append(title)
    return matches


def main():
    """Report lines of a title list that contain zero-width characters.

    Command line: ``script SOURCE_FILE OUTPUT_FILE``.

    SOURCE_FILE is read line by line as bytes. OUTPUT_FILE is opened in
    append mode and receives three sections (ZWNJ, ZWJ, ZWSP). Each section
    starts with a ``<LABEL> =====================`` heading followed by one
    ``* [[title]], [[title without the marker]]`` line per matching title.

    Exits with a usage message when fewer than two arguments are given.
    """
    if len(sys.argv) < 3:
        sys.exit("usage: %s SOURCE_FILE OUTPUT_FILE" % sys.argv[0])
    src_path, dest_path = sys.argv[1], sys.argv[2]

    # Single pass over the input: a file object is exhausted after one
    # iteration, so looping over it once per marker would leave every section
    # after the first empty.
    with open(src_path, "rb") as source_file:
        matches = _collect_matches(source_file)

    with open(dest_path, "ab") as output_file:
        for label, marker in _ZERO_WIDTH_MARKERS:
            output_file.write(label + b" =====================\n")
            for title in matches[marker]:
                stripped = title.replace(marker, b"")
                output_file.write(
                    b"* [[" + title + b"]], [[" + stripped + b"]]\n"
                )


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment