Created
February 14, 2023 08:44
-
-
Save tos-kamiya/1eb95340c28342fc1fdc1967fbe5226f to your computer and use it in GitHub Desktop.
Search and mark strings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import re
import sys
from typing import Iterator, List, Optional, Tuple
# Circled-number glyphs indexed by value: CIRCLED_NUMBERS[n] is the
# one-character circled form of n, for n = 0 .. 50.  The glyphs live in
# four separate runs of consecutive code points.
CIRCLED_NUMBERS = (
    ['\u24ea']                                     # 0
    + [chr(cp) for cp in range(0x2460, 0x2474)]    # 1 .. 20
    + [chr(cp) for cp in range(0x3251, 0x3260)]    # 21 .. 35
    + [chr(cp) for cp in range(0x32b1, 0x32c0)]    # 36 .. 50
)
def digit_str(i: int) -> str:
    """Return the label used to display the non-negative number *i*.

    Numbers that have an entry in ``CIRCLED_NUMBERS`` are returned as that
    entry; any larger number falls back to the parenthesized form ``(i)``.
    """
    assert i >= 0
    return CIRCLED_NUMBERS[i] if i < len(CIRCLED_NUMBERS) else '(%d)' % i
def number_strings(strings: List[str]) -> List[Tuple[str, int]]:
    """Pair each distinct string with the 1-based position of its first occurrence.

    Repeated strings are dropped after their first appearance, so the
    numbers of the remaining strings keep their original positions (a
    duplicate leaves a gap in the numbering rather than shifting it).
    The result preserves the order of first appearance.
    """
    first_position = {}
    for position, text in enumerate(strings, start=1):
        # setdefault keeps the position recorded for the first occurrence.
        first_position.setdefault(text, position)
    return list(first_position.items())
def search_iter(lines_it: Iterator[str], query_strings_w_number: List[Tuple[str, int]]) -> Iterator[Tuple[Optional[int], str]]:
    """Split each input line into plain text and query-string occurrences.

    Args:
        lines_it: Iterable of text lines. Trailing whitespace (including the
            line terminator) is stripped from each line before searching.
        query_strings_w_number: ``(query string, number)`` pairs. Empty query
            strings are ignored. If a string is listed more than once, the
            number of its last pair is used.

    Yields:
        ``(number, text)`` pairs in reading order. ``number`` is the query's
        number when ``text`` is an occurrence of a query string, and ``None``
        when ``text`` is unmatched text. Every line ends with the pair
        ``(None, rest_of_line)`` (possibly an empty string) followed by
        ``(None, '\\n')``.

    Query strings are matched literally. When several of them match at the
    same position, the one listed first wins, because regex alternatives are
    tried from left to right.
    """
    # An empty query string would become an empty regex alternative, which
    # matches at every position and hides the real queries, so drop it.
    qs_to_n = {s: n for s, n in query_strings_w_number if s}

    # With nothing to search for there is no pattern at all: compiling ''
    # would produce zero-width matches whose text is not a key of qs_to_n.
    pat = re.compile('|'.join(re.escape(qs) for qs in qs_to_n)) if qs_to_n else None

    for line in lines_it:
        line = line.rstrip()
        pos = 0  # end of the previous match, i.e. start of not-yet-emitted text
        if pat is not None:
            for m in pat.finditer(line):
                if pos < m.start():
                    yield None, line[pos:m.start()]
                qs = m.group(0)
                yield qs_to_n[qs], qs
                pos = m.end()
        yield None, line[pos:]
        yield None, '\n'
def search_and_mark(lines_it: Iterator[str], query_strings_w_number: List[Tuple[str, int]], marker_ansi: str,
                    add_number: bool = False, number_circled: bool = False) -> None:
    """Print the input with every query-string occurrence highlighted.

    Args:
        lines_it: Iterable of input text lines.
        query_strings_w_number: ``(query string, number)`` pairs to look for.
        marker_ansi: Escape sequence written before each occurrence; the
            style is reset right after the occurrence.
        add_number: When true, prefix each occurrence with its query number.
        number_circled: With ``add_number``, show the number via
            ``digit_str`` instead of the plain ``(n)`` form.
    """
    import colorama
    from colorama import Style

    colorama.init()

    for number, text in search_iter(lines_it, query_strings_w_number):
        if number is None:
            # Unmatched text (and line terminators) pass through unchanged.
            print(text, end='')
            continue

        if not add_number:
            label = ''
        elif number_circled:
            label = digit_str(number) + " "
        else:
            label = "(%d) " % number
        print(marker_ansi + label + text + Style.RESET_ALL, end='')
def count_appearance(lines_it: Iterator[str], query_strings_w_number: List[Tuple[str, int]]) -> List[Tuple[str, int]]:
    """Count how often each query string is found in the input lines.

    Returns one ``(query string, count)`` pair per entry of
    *query_strings_w_number*, in the same order; a query string that is
    never found gets a count of 0.
    """
    tally = {}
    for number, text in search_iter(lines_it, query_strings_w_number):
        # Pairs without a number are unmatched text, not occurrences.
        if number is not None:
            tally[text] = tally.get(text, 0) + 1

    return [(query, tally.get(query, 0)) for query, _ in query_strings_w_number]
def marker_ansi(color_name: str) -> str:
    """Return the colorama background-color code for *color_name*.

    Args:
        color_name: One of ``black``, ``red``, ``green``, ``yellow``,
            ``blue``, ``magenta``, ``cyan`` or ``white`` (lower case).

    Raises:
        KeyError: If *color_name* is not one of the names above.
    """
    from colorama import Back

    background_color_ansi = {
        'black': Back.BLACK,
        'red': Back.RED,
        'green': Back.GREEN,
        'yellow': Back.YELLOW,
        'blue': Back.BLUE,
        'magenta': Back.MAGENTA,
        'cyan': Back.CYAN,
        'white': Back.WHITE,
    }
    # A failed lookup already raises KeyError, so no try/except is needed
    # just to re-raise it.
    return background_color_ansi[color_name]
# Command-line help text. docopt parses this string to build the argument
# parser, so its layout is significant: usage patterns and option lines are
# indented, sections are separated by a blank line, and each option is
# separated from its description by at least two spaces.
__doc__ = """Search and mark strings.

Usage:
  {0} [-c COLOR] [-n|-N] [-s STRING|-f STRINGFILE]... <inputfile>
  {0} -a [-s STRING|-f STRINGFILE]... <inputfile>

Options:
  -c COLOR                    Marker color (black, red, green, yellow, blue, magenta, cyan, white).
  -s STRING                   Query string.
  -f STRINGFILE               Read query strings from the file.
  -n, --add-number            Sequentially number the strings and show each occurrence with the number.
  -N, --add-circled-number    Same as --add-number, but numbers are shown in circled digits.
  -a, --count-appearances     Show how many times each query string appears in the input file.
""".format('markthem')
def main():
    """Command-line entry point.

    Parses the arguments described in the module help text, gathers the
    query strings from ``-s`` options and ``-f`` files, and then either
    prints per-query occurrence counts (``-a``) or prints the input with
    the occurrences highlighted.
    """
    from docopt import docopt

    args = docopt(__doc__)

    query_strings = args['-s']
    string_files = args['-f']
    input_file = args['<inputfile>']
    opt_marker_color = args['-c']
    opt_count_appearance = args['--count-appearances']
    opt_add_circled_number = args['--add-circled-number']
    # Circled numbering is a flavor of numbering, so -N implies -n.
    opt_add_number = True if opt_add_circled_number else args['--add-number']

    # Cyan is the marker color when -c is not given.
    try:
        opt_marker_ansi = marker_ansi(opt_marker_color or 'cyan')
    except KeyError:
        sys.exit("Error: invalid color name for option -c: %s" % opt_marker_color)

    # Each string file may be named only once.
    seen_files = set()
    for path in string_files:
        if path in seen_files:
            sys.exit("Error: duplicated string file: %s" % repr(path))
        seen_files.add(path)

    if input_file == '-' and '-' in string_files:
        sys.exit("Error: the standard input could be either a string file or the input file, not both.")

    # Append the non-blank lines of every string file to the -s strings.
    for path in string_files:
        if path == '-':
            raw_lines = sys.stdin.readlines()
        else:
            with open(path) as inp:
                raw_lines = inp.readlines()
        stripped = (raw.rstrip() for raw in raw_lines)
        query_strings.extend(word for word in stripped if word)

    query_strings_w_number = number_strings(query_strings)

    def process(lines_it):
        # Run the selected mode over the already-opened input.
        if opt_count_appearance:
            qsa = count_appearance(lines_it, query_strings_w_number)
            for (q1, n), (q2, c) in zip(query_strings_w_number, qsa):
                assert q1 == q2
                print("%d %d %s" % (c, n, q1))
        else:
            search_and_mark(lines_it, query_strings_w_number, opt_marker_ansi,
                            add_number=opt_add_number, number_circled=opt_add_circled_number)

    if input_file == '-':
        process(sys.stdin)
    else:
        with open(input_file) as inp:
            process(inp)
# Run the command-line interface only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment