Skip to content

Instantly share code, notes, and snippets.

@andreasvc
Last active December 20, 2015 08:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andreasvc/6103307 to your computer and use it in GitHub Desktop.
Save andreasvc/6103307 to your computer and use it in GitHub Desktop.
Match lines in one file with those of another, and produce line numbers.
""" Match lines in one file with those of another,
and produce line numbers. """
import io
import sys
# Help text for the command line interface. The script name is taken from
# sys.argv[0] at the moment this module is loaded.
USAGE = '\n'.join((
    'Match lines in one file with those of another, and get line numbers.',
    'usage: python %s sents text output',
    'where sents and text are files with one sentence per line.',
    'The result will be of the form "1|line", written to file "output".',
    'Everything is assumed to be encoded with UTF-8.',
)) % sys.argv[0]
def mangle(line):
    """ Reduce a line to a normalized key used for matching.

    Space characters (' ') are removed, the text is lowercased, and any
    character that cannot be encoded as ASCII is dropped. The ASCII-encoded
    result is returned. """
    without_spaces = ''.join(line.split(' '))
    lowered = without_spaces.lower()
    return lowered.encode('ascii', 'ignore')
def findsentnums(sentsfile, bookfile, outfile):
    """ Go through lines of book and report line numbers of lines in sents.

    Lines are compared by their mangled form (see mangle()). For every
    non-blank line of sentsfile whose mangled form also occurs in bookfile,
    the 1-based line number and the text of the book line are written to
    outfile as "number|line". Sentences that are not found are written as
    "XXX|line", after all the numbered lines. When several lines share the
    same mangled form, only the last one is kept.

    sentsfile -- path of the file with the sentences to look up.
    bookfile -- path of the file in which the sentences are looked up.
    outfile -- path of the file to write the result to (overwritten).

    All three files are read / written as UTF-8. """
    # Use context managers so that the input files are closed again.
    with io.open(sentsfile, encoding='utf8') as inp:
        sents = {mangle(line): ('XXX', line)
                for line in inp.read().split('\n') if line.strip()}
    with io.open(bookfile, encoding='utf8') as inp:
        book = {mangle(line): (n + 1, line)
                for n, line in enumerate(inp.read().split('\n'))}
    # Sort both groups separately: matched entries start with an int and
    # unmatched entries with the string 'XXX', and Python 3 refuses to order
    # an int against a str. Numbered lines first, then the unmatched ones,
    # is also the order Python 2 produced when sorting the combined list.
    matches = sorted(book[mangled] for mangled in set(sents) & set(book))
    unmatched = sorted(sents[mangled] for mangled in set(sents) - set(book))
    with io.open(outfile, 'w', encoding='utf8') as out:
        out.writelines('%s|%s\n' % nl for nl in matches + unmatched)
def main():
    """ Command line interface.

    Expects exactly three command line arguments (sents, text, output) and
    passes them to findsentnums(). With any other number of arguments the
    usage message is printed instead. """
    args = sys.argv[1:]
    # Check the argument count explicitly instead of catching TypeError
    # around the call: a TypeError raised while processing the files is a
    # real error and should not be reported as a usage mistake.
    if len(args) != 3:
        print(USAGE)
        return
    findsentnums(*args)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment