kra3/wiki_helper.py

## wiki_helper.py
# -*- coding: utf-8 -*-

__author__ = 'Arun KR (kra3)  <the1.arun@gmail.com>'
__license__ = 'Simplified BSD'

import sys

"""
wiki_helper.py rules.txt data.txt > result.txt

rules.txt could be any text file with one rule per line.
A Rule will be of format CODE :=> STRING

CODE will be one of DL or RM.
DL for delete matching line - PS: matching starts from the beginning of the line, after removing surrounding whitespace.
RM for remove matching strings from a line.

You have to redirect output to a file or another unix command for further processing.
Utility developed for malayalam wikibooks maintainers.
"""

def wiki_helper(data, rules):
  """Filter the text file `data` through the rules in `rules`; write to stdout.

  Every line of the rules file is expected to look like ``CODE :=> STRING``:

  * ``DL`` - drop each data line which, once stripped of surrounding
    whitespace, starts with STRING.
  * ``RM`` - delete every occurrence of STRING from the data lines that
    are kept.

  Rule lines that do not split into exactly two parts on ``:=>``, that use
  an unknown CODE, or whose STRING is empty are ignored.

  Arguments:
    data:  path of the text file to filter.
    rules: path of the rules file.

  Returns None. Lines that survive are written to sys.stdout, unchanged
  apart from the RM removals.
  """
  delete_line_matches = []
  remove_string_matches = []

  # Both files are opened up front (data first, as before) so that a missing
  # file fails before any output is produced; `with` closes them on exit.
  with open(data) as data_fh, open(rules) as rules_fh:
    ## extracting user defined rules
    for line in rules_fh:
      rule = line.split(':=>')

      if len(rule) != 2:  # safeguard against malformed rules.
        continue

      code, expr = [part.strip() for part in rule]

      # An empty DL string would match every line (everything starts with
      # '') and wipe the whole output; an empty RM string removes nothing.
      if not expr:
        continue

      if code == 'DL':
        delete_line_matches.append(expr)
      elif code == 'RM':
        remove_string_matches.append(expr)

    # str.startswith accepts a tuple of prefixes; an empty tuple never matches.
    delete_prefixes = tuple(delete_line_matches)

    ## processing data with rules
    for line in data_fh:
      if line.strip().startswith(delete_prefixes):
        continue  # a DL rule matched: the line is dropped entirely

      for expr in remove_string_matches:
        line = line.replace(expr, '')

      # The line still carries its own newline, so write it verbatim.
      sys.stdout.write(line)


if __name__ == '__main__':
  # Exactly two arguments are required: the rules file, then the data file.
  if len(sys.argv) != 3:
    # Usage goes to stderr: stdout carries the filtered text and is meant
    # to be redirected into the result file, so it must stay clean.
    sys.stderr.write("Incorrect format. Try:\n")
    sys.stderr.write("\twiki_helper.py rules data\n")
    sys.exit(1)

  # wiki_helper takes (data, rules) - the reverse of the command-line order.
  wiki_helper(sys.argv[2], sys.argv[1])
	# -- coding: utf-8 --

	__author__ = 'Arun KR (kra3) <the1.arun@gmail.com>'
	__license__ = 'Simplified BSD'

	import sys

	"""
	wiki_helper.py rules.txt data.txt > result.txt

	rules.txt could be any text file with one rule per line.
	A Rule will be of format CODE :=> STRING

	CODE will be one of DL or RM.
	DL for delete matching line - PS: matching starts from the beginning of the line, after removing surrounding whitespace.
	RM for remove matching strings from a line.

	You have to redirect output to a file or another unix command for further processing.
	Utility developed for malayalam wikibooks maintainers.
	"""

	def wiki_helper(data, rules):
	  """Filter the text file `data` through the rules in `rules`; write to stdout.

	  Every line of the rules file is expected to look like ``CODE :=> STRING``:

	  * ``DL`` - drop each data line which, once stripped of surrounding
	    whitespace, starts with STRING.
	  * ``RM`` - delete every occurrence of STRING from the data lines that
	    are kept.

	  Rule lines that do not split into exactly two parts on ``:=>``, that use
	  an unknown CODE, or whose STRING is empty are ignored.

	  Arguments:
	    data:  path of the text file to filter.
	    rules: path of the rules file.

	  Returns None. Lines that survive are written to sys.stdout, unchanged
	  apart from the RM removals.
	  """
	  delete_line_matches = []
	  remove_string_matches = []

	  # Both files are opened up front (data first) so that a missing file
	  # fails before any output is produced; `with` closes them on exit.
	  with open(data) as data_fh, open(rules) as rules_fh:
	    ## extracting user defined rules
	    for line in rules_fh:
	      rule = line.split(':=>')

	      if len(rule) != 2:  # safeguard against malformed rules.
	        continue

	      code, expr = [part.strip() for part in rule]

	      # An empty DL string would match every line (everything starts with
	      # '') and wipe the whole output; an empty RM string removes nothing.
	      if not expr:
	        continue

	      if code == 'DL':
	        delete_line_matches.append(expr)
	      elif code == 'RM':
	        remove_string_matches.append(expr)

	    # str.startswith accepts a tuple of prefixes; an empty tuple never matches.
	    delete_prefixes = tuple(delete_line_matches)

	    ## processing data with rules
	    for line in data_fh:
	      if line.strip().startswith(delete_prefixes):
	        continue  # a DL rule matched: the line is dropped entirely

	      for expr in remove_string_matches:
	        line = line.replace(expr, '')

	      # The line still carries its own newline, so write it verbatim.
	      sys.stdout.write(line)


	if __name__ == '__main__':
	  # Exactly two arguments are required: the rules file, then the data file.
	  if len(sys.argv) != 3:
	    # Usage goes to stderr: stdout carries the filtered text and is meant
	    # to be redirected into the result file, so it must stay clean.
	    sys.stderr.write("Incorrect format. Try:\n")
	    sys.stderr.write("\twiki_helper.py rules data\n")
	    sys.exit(1)

	  # wiki_helper takes (data, rules) - the reverse of the command-line order.
	  wiki_helper(sys.argv[2], sys.argv[1])