Skip to content

Instantly share code, notes, and snippets.

@tos-kamiya
Created February 14, 2023 08:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tos-kamiya/1eb95340c28342fc1fdc1967fbe5226f to your computer and use it in GitHub Desktop.
Save tos-kamiya/1eb95340c28342fc1fdc1967fbe5226f to your computer and use it in GitHub Desktop.
Search and mark strings
#!/usr/bin/env python3
from typing import Iterator, List, Optional, Tuple
import re
import sys
# Circled-number glyphs, indexed by the value they depict (0 through 50).
# Unicode keeps them in four separate runs, so the table is stitched together
# from the lone zero glyph plus three contiguous code-point ranges.
CIRCLED_NUMBERS = (
    ['\u24ea']                                        # 0
    + [chr(cp) for cp in range(0x2460, 0x2473 + 1)]   # 1 .. 20
    + [chr(cp) for cp in range(0x3251, 0x325f + 1)]   # 21 .. 35
    + [chr(cp) for cp in range(0x32b1, 0x32bf + 1)]   # 36 .. 50
)
def digit_str(i: int) -> str:
    """Return the display form of the non-negative number *i*.

    A number that has an entry in ``CIRCLED_NUMBERS`` is rendered as that
    single glyph; any larger number falls back to the plain form ``(i)``.
    """
    assert i >= 0
    # Guard clause: numbers past the end of the glyph table get parentheses.
    if i >= len(CIRCLED_NUMBERS):
        return '(%d)' % i
    return CIRCLED_NUMBERS[i]
def number_strings(strings: List[str]) -> List[Tuple[str, int]]:
    """Pair each distinct string with the 1-based position of its first occurrence.

    Later duplicates are dropped, so the returned numbers keep the positions
    of the original list and may therefore contain gaps. The result is in
    first-occurrence order.
    """
    first_position = {}
    for position, text in enumerate(strings, start=1):
        # setdefault keeps the position recorded for the first occurrence.
        first_position.setdefault(text, position)
    return list(first_position.items())
def search_iter(lines_it: Iterator[str], query_strings_w_number: List[Tuple[str, int]]) -> Iterator[Tuple[Optional[int], str]]:
    """Split each input line into plain and matched segments.

    Args:
        lines_it: Iterable of text lines. Trailing whitespace (including the
            newline) is stripped from every line before searching.
        query_strings_w_number: ``(query string, number)`` pairs. The strings
            are matched literally. Empty query strings are ignored.

    Yields:
        ``(None, text)`` for text that is not a match and ``(number, query)``
        for each occurrence of a query string. Every line ends with a
        ``(None, remaining text)`` item (possibly empty) followed by
        ``(None, '\\n')``.

    Note:
        The queries are combined into one regex alternation in list order, so
        at any position the first listed query that matches wins, even if a
        later query would match a longer span.
    """
    # An empty query would produce zero-width matches at every position, and
    # an empty alternation would match the empty string that has no number,
    # so empty strings are dropped up front.
    qs_to_n = {s: n for s, n in query_strings_w_number if s}
    pat = re.compile('|'.join(re.escape(qs) for qs in qs_to_n)) if qs_to_n else None

    for line in lines_it:
        line = line.rstrip()
        p = 0  # end of the text already yielded for this line
        if pat is not None:
            for m in pat.finditer(line):
                if p < m.start():
                    # Plain text between the previous match and this one.
                    yield None, line[p:m.start()]
                qs = m.group(0)
                yield qs_to_n[qs], qs
                p = m.end()
        yield None, line[p:]
        yield None, '\n'
def search_and_mark(lines_it: Iterator[str], query_strings_w_number: List[Tuple[str, int]], marker_ansi: str,
                    add_number: bool = False, number_circled: bool = False) -> None:
    """Print the input with every query-string occurrence highlighted.

    Args:
        lines_it: Iterable of input lines.
        query_strings_w_number: ``(query string, number)`` pairs to look for.
        marker_ansi: String written before each occurrence; the colorama
            reset sequence is written after it.
        add_number: When true, each occurrence is prefixed with its number.
        number_circled: With ``add_number``, render the number through
            ``digit_str`` instead of the plain ``(n)`` form.
    """
    import colorama

    colorama.init()
    reset = colorama.Style.RESET_ALL

    for number, text in search_iter(lines_it, query_strings_w_number):
        if number is None:
            # Plain text (or the line terminator): pass through untouched.
            print(text, end='')
            continue

        if not add_number:
            label = ''
        elif number_circled:
            label = digit_str(number) + " "
        else:
            label = "(%d) " % number
        print(marker_ansi + label + text + reset, end='')
def count_appearance(lines_it: Iterator[str], query_strings_w_number: List[Tuple[str, int]]) -> List[Tuple[str, int]]:
    """Count how often each query string occurs in the input lines.

    Returns:
        ``(query string, occurrence count)`` pairs, one per entry of
        ``query_strings_w_number`` and in the same order. Query strings that
        never occur get a count of 0.
    """
    occurrences = {}
    for number, text in search_iter(lines_it, query_strings_w_number):
        # Segments without a number are plain text, not matches.
        if number is not None:
            occurrences[text] = occurrences.get(text, 0) + 1
    return [(query, occurrences.get(query, 0)) for query, _ in query_strings_w_number]
def marker_ansi(color_name: str) -> str:
    """Return colorama's background-color value for *color_name*.

    Raises:
        KeyError: If *color_name* is not one of the eight supported names
            (black, red, green, yellow, blue, magenta, cyan, white).
    """
    from colorama import Back

    supported = ('black', 'red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white')
    # Each supported name maps to the upper-case attribute of colorama.Back.
    background_by_name = {name: getattr(Back, name.upper()) for name in supported}
    return background_by_name[color_name]
# Help text, also used as the docopt interface definition in main().
# docopt only recognizes usage patterns on lines indented under "Usage:", and
# it separates an option from its description by two or more spaces, so the
# layout below is significant.
__doc__ = """Search and mark strings.

Usage:
  {0} [-c COLOR] [-n|-N] [-s STRING|-f STRINGFILE]... <inputfile>
  {0} -a [-s STRING|-f STRINGFILE]... <inputfile>

Options:
  -c COLOR                  Marker color (black, red, green, yellow, blue, magenta, cyan, white).
  -s STRING                 Query string.
  -f STRINGFILE             Read query strings from the file.
  -n, --add-number          Sequentially number the strings and show each occurrence with the number.
  -N, --add-circled-number  Same as --add-number, but numbers are shown in circled digits.
  -a, --count-appearances   Show how many times each query string appears in the input file.
""".format('markthem')
def main():
    """Command-line entry point: parse the arguments, then mark or count matches.

    Exits with an error message when the color name is invalid, when a string
    file is given twice, or when the standard input is requested both as a
    string file and as the input file.
    """
    from docopt import docopt

    args = docopt(__doc__)
    query_strings = args['-s']
    string_files = args['-f']
    input_file = args['<inputfile>']
    number_circled = args['--add-circled-number']
    # Asking for circled numbers implies numbering.
    add_number = True if number_circled else args['--add-number']

    # Cyan is the marker color unless -c names another one.
    color_name = args['-c'] or 'cyan'
    try:
        ansi = marker_ansi(color_name)
    except KeyError:
        sys.exit("Error: invalid color name for option -c: %s" % color_name)

    seen_files = set()
    for path in string_files:
        if path in seen_files:
            sys.exit("Error: duplicated string file: %s" % repr(path))
        seen_files.add(path)

    if input_file == '-' and '-' in string_files:
        sys.exit("Error: the standard input could be either a string file or the input file, not both.")

    # Collect the query strings from the string files, one per non-blank line.
    for path in string_files:
        if path == '-':
            raw_lines = sys.stdin.readlines()
        else:
            with open(path) as inp:
                raw_lines = inp.readlines()
        stripped = (raw.rstrip() for raw in raw_lines)
        query_strings.extend(word for word in stripped if word)

    query_strings_w_number = number_strings(query_strings)

    if args['--count-appearances']:
        def process(lines_it):
            # One output line per query string: count, number, string.
            qsa = count_appearance(lines_it, query_strings_w_number)
            for (q1, n), (q2, c) in zip(query_strings_w_number, qsa):
                assert q1 == q2
                print("%d %d %s" % (c, n, q1))
    else:
        def process(lines_it):
            search_and_mark(lines_it, query_strings_w_number, ansi,
                            add_number=add_number, number_circled=number_circled)

    if input_file == '-':
        process(sys.stdin)
    else:
        with open(input_file) as inp:
            process(inp)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment