Skip to content

Instantly share code, notes, and snippets.

@mwarzynski
Last active June 22, 2017 00:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mwarzynski/3f9704a7b24f575fb83d89093611a1a3 to your computer and use it in GitHub Desktop.
Save mwarzynski/3f9704a7b24f575fb83d89093611a1a3 to your computer and use it in GitHub Desktop.
#!/usr/bin/python3.5
# -*- coding: utf-8 -*-
# keywords: unwrap lines, OCR, pdftotext
import sys
import os
import re
# Ordered (pattern, replacement) rules used to re-join lines that were
# hard-wrapped by OCR / pdftotext.  They are applied one after another, so
# each rule sees the output of the previous one.  Raw strings keep the regex
# escapes (\(, \[, \.) from being treated as invalid string escapes.
#
# The shared look-ahead class matches a lower-case ASCII or Polish letter,
# a space, an opening low quote, "(" or "[" without consuming it.
_CONVERSIONS = (
    # Intended (per the original comment) to "trim space and \12 to just
    # space".
    # NOTE(review): as written this replaces one space with one space, i.e.
    # it is a no-op; the pattern may have lost a character when the script
    # was copied -- confirm against the author's original.
    (re.compile(r'( )(?=[a-zżźćńółęąś „\(\[])'), ' '),
    # Replace one or two newlines with a single space when the following
    # text starts with a character from the look-ahead class, i.e. looks
    # like the continuation of the same sentence.
    (re.compile(r'(\n){1,2}(?=[a-zżźćńółęąś „\(\[])'), ' '),
    # Delete a hyphen that is immediately followed by a period ("-." -> ".").
    (re.compile(r'(-)(?=\.)'), ''),
)


def _unwrap_text(text):
    """Return *text* after applying every rule in _CONVERSIONS in order."""
    for pattern, replacement in _CONVERSIONS:
        text = pattern.sub(replacement, text)
    return text


def txt_unwrap_lines():
    """Unwrap hard-wrapped lines of a text file named on the command line.

    Reads the file given as ``sys.argv[1]``, applies the unwrapping rules
    and writes the result to the path given as ``sys.argv[2]`` (the output
    file is created or overwritten).

    Raises:
        ValueError: if fewer than two command-line arguments were given, or
            if the input path is not an existing regular file.
    """
    if len(sys.argv) < 3:
        raise ValueError("Example: txt_unwrap_lines input.txt output.txt")
    input_filename, output_filename = sys.argv[1:3]
    if not os.path.isfile(input_filename):
        raise ValueError("Input file does not exist.")
    # The encoding is explicit so the Polish letters the rules look for are
    # decoded the same way regardless of the platform's locale default.
    # NOTE(review): assumes the input is UTF-8 -- confirm for your files.
    with open(input_filename, 'r', encoding='utf-8') as f:
        text = f.read()
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(_unwrap_text(text))
if __name__ == "__main__":
    # Run the conversion only when executed as a script, not when imported.
    txt_unwrap_lines()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment