Created
August 12, 2022 16:07
-
-
Save secoats/81fce5f24f6a787a785331ccb0faae3e to your computer and use it in GitHub Desktop.
Python 3: matching multiple words in large texts with Aho-Corasick
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Find every occurrence of several keywords in one large string.

Uses the Aho-Corasick automaton from the third-party ``pyahocorasick``
package:

    pip3 install pyahocorasick

Matching time: Θ(n + o), with n being the length of the haystack and o
being the number of occurrences.
"""
import ahocorasick

# Phrases to search for in the haystack.
wordlist = [
    "password",
    "secret",
    "privatekey",
    "private_key",
    "BEGIN PRIVATE KEY",
    "BEGIN RSA PRIVATE KEY",
]

haystack = "<VERY LONG STRING THAT YOU WANT TO SEARCH>"

automaton = ahocorasick.Automaton()
for word in wordlist:
    testphrase = word.strip()
    if not testphrase:
        # Nothing left after stripping whitespace; there is no phrase to
        # register for this entry.
        continue
    # The phrase is stored as its own value so that every reported match
    # carries the text that was found.
    automaton.add_word(testphrase, testphrase)
automaton.make_automaton()

matches = automaton.iter(haystack)
for endindex, word in matches:
    # Each match is an (end index, found word) pair; print it as a tuple.
    print((endindex, word))

# Example output (end index, found word):
#
#   (342, 'password')
#   (28482, 'BEGIN RSA PRIVATE KEY')
#   (342120, 'password')
#   (367544, 'secret')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment