Skip to content

Instantly share code, notes, and snippets.

@meyt
Last active July 25, 2018 08:25
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save meyt/4c838f3d47cb56ea57523c57eac4225a to your computer and use it in GitHub Desktop.
Save meyt/4c838f3d47cb56ea57523c57eac4225a to your computer and use it in GitHub Desktop.
Extract hashtags from a string (Python 3.x)
import re
import unittest
# Compiled once at import time instead of on every call.
#
#   (?:^|\W)        the "#" must start the text or follow a non-word character,
#                   so "#a#b" yields only "a"
#   #               the tag marker itself (not captured)
#   (?!\d+(?!\w))   reject tags made up solely of digits ("#2015", "#2"), while
#                   still allowing digits followed by other word characters
#                   ("#2nd_ipsum", "#20th_century")
#   (\w+)           the tag body: letters, digits and underscores
_TAG_PATTERN = re.compile(r"(?:^|\W)#(?!\d+(?!\w))(\w+)", re.UNICODE)


def extract_tags(text: str) -> list:
    """Return the hashtags found in *text*, in order of appearance.

    A hashtag is a ``#`` at the start of the text or directly after a
    non-word character (whitespace, brackets, quotes, ...), followed by one
    or more word characters. The returned strings do not include the ``#``.

    Tags consisting only of digits are ignored; a tag may start with digits
    as long as at least one non-digit word character follows. The tag body
    ends at the first non-word character, so ``#ipsu'm`` yields ``ipsu``.

    :param text: The string to scan.
    :return: A list of tag names (possibly empty); duplicates are kept.
    """
    return _TAG_PATTERN.findall(text)
class ExtractTagsTestCase(unittest.TestCase):
    """Table-driven checks for ``extract_tags``."""

    # Each entry is (description, input text, expected tags).
    CASES = (
        (
            'start with tag',
            '#lorem ipsum\n\n',
            ['lorem'],
        ),
        (
            'have more tags',
            '#Lorem #ipsum dolor sit amet,\n\n',
            ['Lorem', 'ipsum'],
        ),
        (
            'with unicode',
            '#Lorem #ایپسوم dolor sit amet,\n\n',
            ['Lorem', 'ایپسوم'],
        ),
        (
            'with underscore (unicode)',
            '#Lorem_a #ایپسوم_b dolor sit amet,\n\n',
            ['Lorem_a', 'ایپسوم_b'],
        ),
        (
            'with several underscores',
            '#Lorem_a #ipsum_a_and_b dolor sit amet,\n\n',
            ['Lorem_a', 'ipsum_a_and_b'],
        ),
        (
            'no special chars',
            '#Lorem #@ipsum dolor sit amet,\n\n',
            ['Lorem'],
        ),
        (
            'no punctuation',
            '#Lorem #2nd_ipsu\'m dolor sit amet,\n\n',
            ['Lorem', '2nd_ipsu'],
        ),
        (
            'tags should not fully match with digits',
            '#Lorem #2015 dolor sit amet,\n\n',
            ['Lorem'],
        ),
        (
            'tags can start with digit but should contain characters on next',
            '#Lorem #2nd_ipsum dolor sit amet,\n\n',
            ['Lorem', '2nd_ipsum'],
        ),
        (
            'tags should separate with spaces',
            '#Lorem#2_ipsum dolor sit amet,\n\n',
            ['Lorem'],
        ),
        (
            'next lines',
            '#Lorem#2_ipsum dolor sit amet,\n\n #next #line',
            ['Lorem', 'next', 'line'],
        ),
        (
            'ignore brackets',
            '<#Lorem>(#2_ipsum) dolor sit amet,\n\n "#next" <#line> «#تگ»',
            ['Lorem', '2_ipsum', 'next', 'line', 'تگ'],
        ),
    )

    def test_extract_tags(self):
        """Check every entry in ``CASES``.

        Each case runs in its own ``subTest`` so that one failing input is
        reported with its description without hiding the remaining cases.
        """
        for description, text, expected in self.CASES:
            with self.subTest(description, text=text):
                self.assertEqual(extract_tags(text), expected)
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment