Skip to content

Instantly share code, notes, and snippets.

@hdlim15
Last active November 23, 2016 13:44
Show Gist options
  • Save hdlim15/2419dd8f6bb3987f4a3536a4008b0544 to your computer and use it in GitHub Desktop.
Save hdlim15/2419dd8f6bb3987f4a3536a4008b0544 to your computer and use it in GitHub Desktop.
import re
# Tests a re pattern against several tweets and sees how accurately it removes twitter handles
def test_re(pattern):
    """Check how accurately ``pattern`` strips Twitter handles from sample tweets.

    Each sample tweet is run through ``pattern.sub("", tweet)`` and the result
    is compared with the text expected once the handles are removed. One
    pass/fail line is printed per tweet, followed by a summary line.

    Args:
        pattern: A compiled regular expression (any object exposing
            ``sub(repl, string)``) intended to match Twitter handles.

    Returns:
        True if every sample tweet produced its expected text, otherwise False.
    """
    # Each entry pairs a tweet with the same tweet after handle removal
    # (expected texts were determined by experimenting on Twitter).
    # Keeping the two together means they can never get out of step.
    cases = [
        # Simple example. Handles with just numbers should be allowed.
        # Note the two trailing spaces: the space between the last two
        # handles survives, as does the one before "@12345".
        (
            "@twitter hello @twi_tter_. hi @12345 @123news",
            " hello . hi  ",
        ),
        # Handles are allowed to follow any of the following characters
        (
            "@n`@n~@n(@n)@n-@n=@n+@n\\@n|@n[@n]@n{@n}@n;@n:@n'@n\"@n/@n?@n.@n,@n<@n>@n @n\n@n",
            "`~()-=+\\|[]{};:'\"/?.,<> \n",
        ),
        # Handles are NOT allowed to follow any of the following characters
        (
            "a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@n",
            "a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@n",
        ),
        # Handles are allowed to precede the following characters
        (
            "@n!a @n#a @n$a @n%a @n&a @n*a",
            "!a #a $a %a &a *a",
        ),
        # Tests interactions with special symbols and multiple @
        (
            "@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n",
            "!@n #@n $@n %@n &@n *@n @n@n @@n @n@@n @n_@n @n7@n @nj@n",
        ),
        # Tests that handles can have a max length of 20
        (
            "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmnopqrst1234 @abcdefghijklmnopqrst_ @abcdefghijklmnopqrstendofhandle",
            "uvwxyz 1234 _ endofhandle",
        ),
        # Edge case where an @ comes directly after a long handle
        (
            "@abcdefghijklmnopqrstu@abcde @abcdefghijklmnopqrst@abcde @abcdefghijklmnopqrst_@abcde @abcdefghijklmnopqrst5@abcde",
            "u@abcde @abcdefghijklmnopqrst@abcde _@abcde 5@abcde",
        ),
    ]

    failed = []  # indices (as strings) of the cases that did not match
    for index, (tweet, expected) in enumerate(cases):
        removed = pattern.sub("", tweet)
        if removed == expected:
            print("Passed test " + str(index))
        else:
            failed.append(str(index))
            print("\nFailed test " + str(index))
            print("Your Solution: " + removed)
            print("Correct Solution: " + expected)

    if failed:
        print("Failed tests: " + ", ".join(failed))
    else:
        print("All tests passed")
    return not failed


# The name starts with "test_", so pytest would otherwise try to collect this
# helper as a test and error out on the unknown "pattern" fixture.
test_re.__test__ = False
# Pattern currently used by remove_handles; fails tests 0, 1, 3, 4, 5, 6.
nltk_pattern = re.compile(r"(^|(?<=[^\w.-]))@[A-Za-z_]+\w+")
# Suggested replacement; the author's posted run shows it passing all tests.
my_pattern = re.compile(r"(?<![A-Za-z0-9_!@#\$%&*])@((([A-Za-z0-9_]){20}(?!@))|(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@))")

# Report on each pattern in turn, separated by a single blank line.
_reports = (
    ("Current remove_handles re: ", nltk_pattern),
    ("My new suggested re: ", my_pattern),
)
for _position, (_heading, _candidate) in enumerate(_reports):
    if _position:
        print()
    print(_heading)
    test_re(_candidate)
@hdlim15
Copy link
Author

hdlim15 commented Nov 23, 2016

OUTPUT:

Current remove_handles re: 

Failed test 0
Your    Solution:  hello . hi @12345 @123news
Correct Solution:  hello . hi  

Failed test 1
Your    Solution: @n`@n~@n(@n)@n-@n=@n+@n\@n|@n[@n]@n{@n}@n;@n:@n'@n"@n/@n?@n.@n,@n<@n>@n @n
@n
Correct Solution: `~()-=+\|[]{};:'"/?.,<> 

Passed test 2

Failed test 3
Your    Solution: @n!a @n#a @n$a @n%a @n&a @n*a
Correct Solution: !a #a $a %a &a *a

Failed test 4
Your    Solution: @n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n @n @n
Correct Solution: !@n #@n $@n %@n &@n *@n @n@n @@n @n@@n @n_@n @n7@n @nj@n

Failed test 5
Your    Solution:    
Correct Solution: uvwxyz 1234 _ endofhandle

Failed test 6
Your    Solution: @abcde @abcde @abcde @abcde
Correct Solution: u@abcde @abcdefghijklmnopqrst@abcde _@abcde 5@abcde
Failed tests: 0, 1, 3, 4, 5, 6

My new suggested re: 
Passed test 0
Passed test 1
Passed test 2
Passed test 3
Passed test 4
Passed test 5
Passed test 6
All tests passed

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment