Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
FORK: Python version of Ruby script to preprocess tweets for use in GloVe featurization http://nlp.stanford.edu/projects/glove/.
"""
preprocess-twitter.py
python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)"
Script for preprocessing tweets by Romain Paulus
with small modifications by Jeffrey Pennington
with translation to Python by Motoki Wu
Translation of Ruby script to create features for GloVe vectors for Twitter data.
http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
"""
import sys
import regex as re
# Flags passed to the regex calls below: MULTILINE lets ^ and $ match at
# every line boundary, and DOTALL lets "." match newline characters too.
FLAGS = re.MULTILINE | re.DOTALL
def hashtag(text):
    """Rewrite one matched hashtag as GloVe-style tokens.

    Intended as the replacement callback for the hashtag substitution in
    ``tokenize``.

    Args:
        text: Match object whose matched text is ``#`` followed by the
            hashtag body.

    Returns:
        ``"<hashtag> body <allcaps>"`` (body lower-cased) when the body is
        entirely upper case; otherwise ``"<hashtag>"`` followed by the body
        broken in front of every capital letter, joined by single spaces.
    """
    hashtag_body = text.group()[1:]
    if hashtag_body.isupper():
        # Keep the <hashtag> marker so both branches agree, and lower-case
        # the body so a later upper-case rule cannot tag it a second time.
        return "<hashtag> {} <allcaps>".format(hashtag_body.lower())
    # Collect the pieces directly instead of splitting on a zero-width
    # look-ahead: empty-match splitting differs between regex engines and
    # versions and can produce a leading empty piece (a doubled space).
    # The pattern only uses character classes, so no flags are needed.
    pieces = re.findall(r"[A-Z][^A-Z]*|[^A-Z]+", hashtag_body)
    return " ".join(["<hashtag>"] + pieces)
def allcaps(text):
    """Lower-case a matched upper-case run and flag it with ``<allcaps>``.

    Args:
        text: Match object for the run to rewrite.

    Returns:
        The matched text in lower case, followed by a space and the
        ``<allcaps>`` marker.
    """
    lowered = text.group().lower()
    return "".join((lowered, " <allcaps>"))
def tokenize(text):
    """Normalise a tweet into the token vocabulary used for GloVe features.

    URLs, mentions, emoticons, hearts, numbers, hashtags, repeated
    punctuation, elongated words and upper-case runs are replaced by (or
    annotated with) angle-bracket marker tokens, and the result is
    lower-cased.

    Args:
        text: Raw tweet text.

    Returns:
        The rewritten, lower-cased text.
    """
    # Building blocks shared by the emoticon patterns.
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # Ordered rewrite rules: every rule runs on the output of the previous
    # one, so the sequence must not be rearranged.
    rules = (
        (r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>"),
        (r"@\w+", "<user>"),
        (r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>"),
        (r"{}{}p+".format(eyes, nose), "<lolface>"),
        (r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>"),
        (r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>"),
        (r"/", " / "),
        (r"<3", "<heart>"),
        (r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>"),
        (r"#\S+", hashtag),
        (r"([!?.]){2,}", r"\1 <repeat>"),
        (r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>"),
        # The Ruby script's broader rule tagged nearly everything as
        # <allcaps>; this port only tags runs of two or more capitals.
        (r"([A-Z]){2,}", allcaps),
    )

    processed = text
    for pattern, replacement in rules:
        processed = re.sub(pattern, replacement, processed, flags=FLAGS)
    return processed.lower()
if __name__ == '__main__':
    # Exactly one argument is expected: the tweet text, or the literal word
    # "test" to run the built-in sample. Report misuse with a usage line
    # rather than an unpacking traceback.
    if len(sys.argv) != 2:
        sys.exit('usage: python {} "<tweet text>" (or: test)'.format(sys.argv[0]))
    text = sys.argv[1]
    if text == "test":
        text = "I TEST alllll kinds of #hashtags and #HASHTAGS, @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!"
    tokens = tokenize(text)
    print(tokens)
@ppope

This comment has been minimized.

Copy link
Owner Author

ppope commented Feb 20, 2018

@ppope

This comment has been minimized.

Copy link
Owner Author

ppope commented Feb 20, 2018

Edited original to accommodate comments in original thread.

@ratihpp

This comment has been minimized.

Copy link

ratihpp commented Sep 10, 2019

thank you so much! this works beautifully on python3 👍

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.