# mwarzynski/txt_unwrap_lines.py

## txt_unwrap_lines.py
#!/usr/bin/python3.5
# -*- coding: utf-8 -*-

# keywords: unwrap lines, OCR, pdftotext

import sys
import os
import re

def _unwrap_text(text):
    """Return *text* with wrapped lines joined and '-.' reduced to '.'.

    The substitutions are applied in order to the whole text:

    1. a space followed by a lowercase letter, an opening quote, "(" or
       "[" is replaced by a space;
    2. one or two newlines followed by one of those characters are
       replaced by a single space, joining the line to the previous one;
    3. a "-" directly followed by "." is removed.
    """
    # Patterns are raw strings so that "\(", "\[" and "\." reach the regex
    # engine as written instead of being treated as (invalid) string escapes.
    conversions = (
        # trim space and \12 to just space
        # NOTE(review): as written this replaces a single space with a
        # single space, i.e. it changes nothing. The comment above suggests
        # the pattern was meant to also consume a newline -- confirm intent.
        (r'( )(?=[a-zżźćńółęąś„\(\[])', ' '),
        # delete new line characters if next row starts from small letter
        (r'(\n){1,2}(?=[a-zżźćńółęąś„\(\[])', ' '),
        # delete - from '-.' case
        (r'(-)(?=\.)', ''),
    )

    for pattern, replacement in conversions:
        text = re.sub(pattern, replacement, text)
    return text


def txt_unwrap_lines():
    """Unwrap the lines of the text file named on the command line.

    Reads the file at ``sys.argv[1]``, applies the line-unwrapping
    substitutions and writes the result to ``sys.argv[2]``. Both files are
    read and written as UTF-8 so that the Polish characters used by the
    patterns are handled the same way regardless of the system locale.

    Raises:
        ValueError: if fewer than two command-line arguments were given,
            or if the input path is not an existing file.
    """
    if len(sys.argv) < 3:
        raise ValueError("Example: txt_unwrap_lines input.txt output.txt")

    input_filename = sys.argv[1]
    output_filename = sys.argv[2]

    if not os.path.isfile(input_filename):
        raise ValueError("Input file does not exist.")

    with open(input_filename, 'r', encoding='utf-8') as f:
        changed = _unwrap_text(f.read())

    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(changed)

# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    txt_unwrap_lines()
	#!/usr/bin/python3.5
# -*- coding: utf-8 -*-

# keywords: unwrap lines, OCR, pdftotext

# NOTE(review): everything from here on looks like a second copy of the
# script that appears earlier in this file, pasted with its indentation
# lost. The indentation is restored here so the code parses; confirm
# whether this copy should be kept at all.

import sys
import os
import re


def _unwrap_text(text):
    """Return *text* with wrapped lines joined and '-.' reduced to '.'.

    The substitutions are applied in order to the whole text:

    1. a space followed by a lowercase letter, an opening quote, "(" or
       "[" is replaced by a space;
    2. one or two newlines followed by one of those characters are
       replaced by a single space, joining the line to the previous one;
    3. a "-" directly followed by "." is removed.
    """
    # Patterns are raw strings so that "\(", "\[" and "\." reach the regex
    # engine as written instead of being treated as (invalid) string escapes.
    conversions = (
        # trim space and \12 to just space
        # NOTE(review): as written this replaces a single space with a
        # single space, i.e. it changes nothing. The comment above suggests
        # the pattern was meant to also consume a newline -- confirm intent.
        (r'( )(?=[a-zżźćńółęąś„\(\[])', ' '),
        # delete new line characters if next row starts from small letter
        (r'(\n){1,2}(?=[a-zżźćńółęąś„\(\[])', ' '),
        # delete - from '-.' case
        (r'(-)(?=\.)', ''),
    )

    for pattern, replacement in conversions:
        text = re.sub(pattern, replacement, text)
    return text


def txt_unwrap_lines():
    """Unwrap the lines of the text file named on the command line.

    Reads the file at ``sys.argv[1]``, applies the line-unwrapping
    substitutions and writes the result to ``sys.argv[2]``. Both files are
    read and written as UTF-8 so that the Polish characters used by the
    patterns are handled the same way regardless of the system locale.

    Raises:
        ValueError: if fewer than two command-line arguments were given,
            or if the input path is not an existing file.
    """
    if len(sys.argv) < 3:
        raise ValueError("Example: txt_unwrap_lines input.txt output.txt")

    input_filename = sys.argv[1]
    output_filename = sys.argv[2]

    if not os.path.isfile(input_filename):
        raise ValueError("Input file does not exist.")

    with open(input_filename, 'r', encoding='utf-8') as f:
        changed = _unwrap_text(f.read())

    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(changed)


# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    txt_unwrap_lines()