Skip to content

Instantly share code, notes, and snippets.

@tabVersion
Last active August 26, 2020 14:25
Show Gist options
  • Save tabVersion/45ce3c24acd18098497b4293203deebf to your computer and use it in GitHub Desktop.
Save tabVersion/45ce3c24acd18098497b4293203deebf to your computer and use it in GitHub Desktop.
# Requirements: use regular expressions to retrieve the sentences in the text
# that contain specific strings, and align the output.
# Find the sentences that contain "because ... so" ("因为 ... 所以") and align the
# output with those two words as the centers. Everything between "because" and
# "so" is output in full, together with 2 bytes or more of context before and
# after. If a sentence contains multiple "because", output each match
# separately. Note that each "because" can only be used once.
import re
# Compiled once at import time: ``find`` is recursive, so building the patterns
# inside it would repeat the work on every nested call.
# Both patterns capture up to two characters before "因为", the span up to
# "所以", and exactly two characters after "所以". They differ only in whether
# the span between the keywords is matched greedily or lazily.
_GREEDY_PATTERN = re.compile(r'.?.?因为.*所以..')
_NON_GREEDY_PATTERN = re.compile(r'.?.?因为.*?所以..')


def _collect(pattern, text):
    """Return *pattern*'s matches in *text*, then the matches nested inside them."""
    top_level = pattern.findall(text)
    results = list(top_level)
    # Iterate over the top-level matches only. ``find`` already descends through
    # every deeper level, so re-scanning the entries appended below would report
    # nested matches more than once.
    for match in top_level:
        # Text strictly between the first "因为" and the trailing "所以..":
        # skip the 2-character keyword, drop "所以" plus its 2 context characters.
        inner = match[match.find('因为') + 2:-4]
        # ``find`` returns an empty list when ``inner`` has no match, so no
        # separate pre-check is needed.
        results.extend(find(inner))
    return results


def find(string: str):
    """Find every "因为 ... 所以" fragment in *string*, including nested ones.

    The text is scanned twice, once with a greedy and once with a non-greedy
    span between the two keywords. For each scan, the text between the
    keywords of every match is searched again recursively. Whichever scan
    produced more fragments is returned; on a tie the non-greedy result wins.

    Args:
        string: The text to search. ``.`` does not match a newline, so a
            fragment never spans more than one line.

    Returns:
        A list of matched fragments. Each fragment consists of up to two
        characters of left context, "因为", the enclosed text, "所以" and
        exactly two characters of right context. Top-level matches come
        first, followed by the fragments nested inside them. The list is
        empty when nothing matches.
    """
    greedy_res = _collect(_GREEDY_PATTERN, string)
    non_greedy_res = _collect(_NON_GREEDY_PATTERN, string)
    if len(greedy_res) > len(non_greedy_res):
        return greedy_res
    return non_greedy_res
def format_match(line_no, item):
    """Format one fragment returned by ``find`` as a tab-separated output row.

    Args:
        line_no: 1-based number of the corpus line the fragment came from.
        item: A fragment containing "因为" and ending with "所以" followed by
            exactly two characters (the shape the patterns in ``find`` produce).

    Returns:
        The row ``line_no, left context, *因为*, enclosed text, &所以&,
        right context`` joined by tab characters.
    """
    pos = item.find('因为')
    # Everything before "因为" is the left context (0, 1 or 2 characters).
    # Slicing up to ``pos`` avoids leaking "因" into the context column when
    # only one character precedes the keyword.
    return f'{line_no}\t{item[:pos]}\t*因为*\t{item[pos + 2: -4]}\t&所以&\t{item[-2:]}'


if __name__ == "__main__":
    # NOTE(review): no encoding is given, so the platform default is used;
    # confirm the corpus encoding and pass ``encoding=`` explicitly if needed.
    with open('corpus(1).txt') as f:
        # Iterate the file lazily, numbering lines from 1 for the output.
        for line_no, line in enumerate(f, start=1):
            for item in find(line):
                print(format_match(line_no, item))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment